{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "e51cd3d8-2474-489b-ba3a-f3aaeb829024",
   "metadata": {},
   "source": [
    "# 导包并定义函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "9fcc4b7a-04e2-41b9-b5cd-f032e794c895",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T02:46:38.432892Z",
     "iopub.status.busy": "2024-11-11T02:46:38.432032Z",
     "iopub.status.idle": "2024-11-11T02:46:41.460539Z",
     "msg_id": "6e1d517e-ccd2-4595-a6f1-ea54f0af923f",
     "shell.execute_reply": "2024-11-11T02:46:41.459766Z",
     "shell.execute_reply.started": "2024-11-11T02:46:38.432858Z"
    }
   },
   "outputs": [],
   "source": [
    "run A榜复现_导包并定义函数.ipynb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9c74c401-7384-47a3-b63e-27ae75a10a5e",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T02:46:42.493288Z",
     "iopub.status.busy": "2024-11-11T02:46:42.492547Z",
     "iopub.status.idle": "2024-11-11T02:46:42.655068Z",
     "msg_id": "2d18e3c3-07c1-4d3b-aad1-8ad36c7b5c96",
     "shell.execute_reply": "2024-11-11T02:46:42.654319Z",
     "shell.execute_reply.started": "2024-11-11T02:46:42.493254Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "os.system('rm -rf tmp')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "15cc67d1-b427-4777-a247-0c927caabb2e",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T02:46:44.189749Z",
     "iopub.status.busy": "2024-11-11T02:46:44.189235Z",
     "iopub.status.idle": "2024-11-11T02:46:44.194041Z",
     "msg_id": "918d3179-e8ff-4cd9-ab5f-b4229e194282",
     "shell.execute_reply": "2024-11-11T02:46:44.193370Z",
     "shell.execute_reply.started": "2024-11-11T02:46:44.189713Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "os.makedirs(\"tmp\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f268dd26-967a-46e8-8111-1c3108529c43",
   "metadata": {},
   "source": [
    "# 数据读取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "3bdbc853-e9f6-4572-be55-955f40f20a1f",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T02:46:47.374556Z",
     "iopub.status.busy": "2024-11-11T02:46:47.374077Z",
     "iopub.status.idle": "2024-11-11T02:46:47.684831Z",
     "msg_id": "5f31d056-ff6e-4893-b5f4-968bdddf201d",
     "shell.execute_reply": "2024-11-11T02:46:47.684151Z",
     "shell.execute_reply.started": "2024-11-11T02:46:47.374526Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(59079, 12)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>数据日期</th>\n",
       "      <th>客户编号</th>\n",
       "      <th>经营期限至</th>\n",
       "      <th>经营期限自</th>\n",
       "      <th>经营状态</th>\n",
       "      <th>注册资本</th>\n",
       "      <th>成立日期</th>\n",
       "      <th>法定代表人/负责人/执行事务合伙人</th>\n",
       "      <th>企业（机构）类型编码</th>\n",
       "      <th>所在省份编码</th>\n",
       "      <th>国民经济行业代码</th>\n",
       "      <th>is_train</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>20020727</td>\n",
       "      <td>182d6a854532dd26a1b111e77bd501f4</td>\n",
       "      <td>20420701</td>\n",
       "      <td>19920702</td>\n",
       "      <td>在营（开业）</td>\n",
       "      <td>275.77</td>\n",
       "      <td>19920702</td>\n",
       "      <td>444360f253c09f7d97a3b15bb26a8573</td>\n",
       "      <td>46f6ddc7a540fa9e2b5ac3fa24038304</td>\n",
       "      <td>3048e339d9689928fc83eb6aa552ccfb</td>\n",
       "      <td>1407ccc0c9f66ff0402271dada75885e</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>20020727</td>\n",
       "      <td>f60def7aa5dc124ddae552b7bf5c7675</td>\n",
       "      <td>长期</td>\n",
       "      <td>19930512</td>\n",
       "      <td>在营（开业）</td>\n",
       "      <td>218.88</td>\n",
       "      <td>19930512</td>\n",
       "      <td>0dbf92ddf037b7727060924e76284d1c</td>\n",
       "      <td>46f6ddc7a540fa9e2b5ac3fa24038304</td>\n",
       "      <td>181b1987746f41b780200a407686ffc5</td>\n",
       "      <td>d5f34ccd26f45e66747884462c45c309</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       数据日期                              客户编号     经营期限至     经营期限自    经营状态  \\\n",
       "0  20020727  182d6a854532dd26a1b111e77bd501f4  20420701  19920702  在营（开业）   \n",
       "1  20020727  f60def7aa5dc124ddae552b7bf5c7675        长期  19930512  在营（开业）   \n",
       "\n",
       "     注册资本      成立日期                 法定代表人/负责人/执行事务合伙人  \\\n",
       "0  275.77  19920702  444360f253c09f7d97a3b15bb26a8573   \n",
       "1  218.88  19930512  0dbf92ddf037b7727060924e76284d1c   \n",
       "\n",
       "                         企业（机构）类型编码                            所在省份编码  \\\n",
       "0  46f6ddc7a540fa9e2b5ac3fa24038304  3048e339d9689928fc83eb6aa552ccfb   \n",
       "1  46f6ddc7a540fa9e2b5ac3fa24038304  181b1987746f41b780200a407686ffc5   \n",
       "\n",
       "                           国民经济行业代码  is_train  \n",
       "0  1407ccc0c9f66ff0402271dada75885e         1  \n",
       "1  d5f34ccd26f45e66747884462c45c309         1  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "file_name = 'XW_ENTINFO_BASIC'\n",
    "BASIC = get_data(file_name, num_rows=None)\n",
    "print(BASIC.shape)\n",
    "BASIC.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "eac62717-8b22-4bcc-8f24-24e1457afdea",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T02:46:50.818324Z",
     "iopub.status.busy": "2024-11-11T02:46:50.817825Z",
     "iopub.status.idle": "2024-11-11T02:46:50.896795Z",
     "msg_id": "51156d60-083a-45e0-9262-381b475a3632",
     "shell.execute_reply": "2024-11-11T02:46:50.896150Z",
     "shell.execute_reply.started": "2024-11-11T02:46:50.818290Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(59079, 4)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>数据日期</th>\n",
       "      <th>客户编号</th>\n",
       "      <th>FLAG</th>\n",
       "      <th>is_train</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>20030728</td>\n",
       "      <td>158a8d99bec2a2b652a6de45a2b52ec9</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>20030728</td>\n",
       "      <td>b1d244a25a82adb7beafe33fe971402c</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       数据日期                              客户编号  FLAG  is_train\n",
       "0  20030728  158a8d99bec2a2b652a6de45a2b52ec9   0.0         1\n",
       "1  20030728  b1d244a25a82adb7beafe33fe971402c   0.0         1"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "file_name = 'XW_ENTINFO_TARGET'\n",
    "TARGET = get_data(file_name, num_rows=None)\n",
    "print(TARGET.shape)\n",
    "TARGET.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "3671cde4-6610-4109-a59a-71f0c1234433",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T02:46:54.693442Z",
     "iopub.status.busy": "2024-11-11T02:46:54.692966Z",
     "iopub.status.idle": "2024-11-11T02:46:54.701998Z",
     "msg_id": "0be30afe-fabc-4c49-a035-4b59d91a326e",
     "shell.execute_reply": "2024-11-11T02:46:54.701358Z",
     "shell.execute_reply.started": "2024-11-11T02:46:54.693411Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    53094\n",
       "0     5985\n",
       "Name: is_train, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "TARGET['is_train'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0476de3a-f075-43ba-a648-c5a27aeb640c",
   "metadata": {},
   "source": [
    "# 基本信息表业务特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "4e32e624-0e8b-413e-920c-070877754e86",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T02:47:02.230423Z",
     "iopub.status.busy": "2024-11-11T02:47:02.229699Z",
     "iopub.status.idle": "2024-11-11T02:47:02.264708Z",
     "msg_id": "767f6efb-cd8b-48be-a2a1-bcf35e06fd95",
     "shell.execute_reply": "2024-11-11T02:47:02.264023Z",
     "shell.execute_reply.started": "2024-11-11T02:47:02.230392Z"
    }
   },
   "outputs": [],
   "source": [
    "def BASIC_info():\n",
    "    # 经营期限至：将长期转化为29991231\n",
    "    file_name = 'XW_ENTINFO_BASIC'\n",
    "    BASIC = get_data(file_name, num_rows=None)\n",
    "    BASIC.columns = ['数据日期', '客户编号', '经营期限至', '经营期限自', '经营状态', '注册资本', '成立日期','法定代表人', '企业（机构）类型编码', '所在省份编码', '国民经济行业代码', 'is_train']\n",
    "    data = BASIC.copy()\n",
    "    data['是否长期经营']            = np.where((data['经营期限至'] == '长期'),1,0)\n",
    "    data['经营成立时间是否相等']    = np.where((data['经营期限自'] == data['成立日期']),1,0)\n",
    "    data['注册资金过小']            = np.where((data['注册资本'] <= 47.16),1,0)\n",
    "    data['经营期限至']              = data['经营期限至'].apply(lambda x: 29991231 if x == '长期' else x).astype(int)\n",
    "    data['经营是否已过期']          = data['经营期限至'].apply(lambda x: 1 if x < 20020727 else 0)\n",
    "    #按天\n",
    "    data['剩余经营天数_天']            = data[['经营期限至', '数据日期']].apply(lambda x: two_date_dis(x[0], x[1])[0], axis = 1)  \n",
    "    data['已经营天数_天']              = data[['数据日期', '经营期限自']].apply(lambda x: two_date_dis(x[0], x[1])[0], axis = 1)                \n",
    "    data['当期经营期限总天数_天']      = data[['经营期限至', '经营期限自']].apply(lambda x: two_date_dis(x[0], x[1])[0], axis = 1) \n",
    "    data['自成立经营期限总天数_天']    = data[['经营期限至', '成立日期']].apply(lambda x: two_date_dis(x[0], x[1])[0], axis = 1) \n",
    "    data['已成立天数_天']              = data[['数据日期', '成立日期']].apply(lambda x: two_date_dis(x[0], x[1])[0], axis = 1)    #成立日期\n",
    "    data['再次经营_天']                = data[['经营期限自', '成立日期']].apply(lambda x: two_date_dis(x[0], x[1])[0], axis = 1)\n",
    "    # #按月\n",
    "    data['剩余经营天数_月']            = data[['经营期限至', '数据日期']].apply(lambda x: two_date_dis(x[0], x[1])[1], axis = 1)  \n",
    "    data['已经营天数_月']              = data[['数据日期', '经营期限自']].apply(lambda x: two_date_dis(x[0], x[1])[1], axis = 1)                \n",
    "    data['当期经营期限总天数_月']      = data[['经营期限至', '经营期限自']].apply(lambda x: two_date_dis(x[0], x[1])[1], axis = 1) \n",
    "    data['自成立经营期限总天数_月']    = data[['经营期限至', '成立日期']].apply(lambda x: two_date_dis(x[0], x[1])[1], axis = 1) \n",
    "    data['已成立天数_月']              = data[['数据日期', '成立日期']].apply(lambda x: two_date_dis(x[0], x[1])[1], axis = 1)    #成立日期\n",
    "    data['再次经营_月']                = data[['经营期限自', '成立日期']].apply(lambda x: two_date_dis(x[0], x[1])[1], axis = 1)\n",
    "    #按年\n",
    "    data['剩余经营天数_年']            = data[['经营期限至', '数据日期']].apply(lambda x: two_date_dis(x[0], x[1])[2], axis = 1)  \n",
    "    data['已经营天数_年']              = data[['数据日期', '经营期限自']].apply(lambda x: two_date_dis(x[0], x[1])[2], axis = 1)                \n",
    "    data['当期经营期限总天数_年']      = data[['经营期限至', '经营期限自']].apply(lambda x: two_date_dis(x[0], x[1])[2], axis = 1) \n",
    "    data['自成立经营期限总天数_年']    = data[['经营期限至', '成立日期']].apply(lambda x: two_date_dis(x[0], x[1])[2], axis = 1) \n",
    "    data['已成立天数_年']              = data[['数据日期', '成立日期']].apply(lambda x: two_date_dis(x[0], x[1])[2], axis = 1)    #成立日期\n",
    "    data['再次经营_年']                = data[['经营期限自', '成立日期']].apply(lambda x: two_date_dis(x[0], x[1])[2], axis = 1)\n",
    "    #经营状态 训练集中，迁出坏率大于在营坏率大于注销吊销\n",
    "    ENTSTATUS_dict = {'迁出':0, '在营（开业）':1, '注销':2, '吊销，未注销':3}\n",
    "    data['经营状态_编码'] = data['经营状态'].map(ENTSTATUS_dict)\n",
    "    #注册资本\n",
    "    data['注册资本'] = pow((data['注册资本'])/3.12,3).round(2)\n",
    "    #企业类型编码\n",
    "    a = data['企业（机构）类型编码'].value_counts()\n",
    "    head_1 = a.head(10).index.tolist()\n",
    "    tail_1 = a.tail(20).index.tolist()\n",
    "    data[\"企业（机构）类型编码_频数是否前10\"] = data[\"企业（机构）类型编码\"].apply(lambda x: 1 if x in head_1 else 0)\n",
    "    data[\"企业（机构）类型编码_频数是否后20\"] = data[\"企业（机构）类型编码\"].apply(lambda x: 1 if x in tail_1 else 0)\n",
    "    data[\"企业（机构）类型编码_是否频数最高2类\"] = data[\"企业（机构）类型编码\"].apply(lambda x: 1 if x == '46f6ddc7a540fa9e2b5ac3fa24038304' or x == 'd6c937931560c340740515da55cfabb5' else 0)\n",
    "    #省份编码\n",
    "    a = data['所在省份编码'].value_counts()\n",
    "    head_1 = a.head(5).index.tolist()\n",
    "    tail_1 = a.tail(5).index.tolist()\n",
    "    data[\"所在省份编码_频数是否前5\"] = data[\"所在省份编码\"].apply(lambda x: 1 if x in head_1 else 0)\n",
    "    data[\"所在省份编码_频数是否后5\"] = data[\"所在省份编码\"].apply(lambda x: 1 if x in tail_1 else 0)\n",
    "    data[\"企业（机构）类型编码_是否坏率最高2类\"] = data[\"所在省份编码\"].apply(lambda x: 1 if x == 'c3e3dd960e9608c4fc446fe4de09943a' or x == '787f623759f116bd7c5ffdee4bed4a02' else 0)\n",
    "    #国民经济行业代码\n",
    "    a = data['国民经济行业代码'].value_counts()\n",
    "    head_1 = a.head(5).index.tolist()\n",
    "    tail_1 = a.tail(230).index.tolist()\n",
    "    data[\"国民经济行业代码_频数是否前5\"]   = data[\"国民经济行业代码\"].apply(lambda x: 1 if x in head_1 else 0)\n",
    "    data[\"国民经济行业代码_频数是否后230\"] = data[\"国民经济行业代码\"].apply(lambda x: 1 if x in tail_1 else 0)\n",
    "    \n",
    "    #分箱（由于数据泄露，最后没有使用）\n",
    "    data['企业（机构）类型编码_分箱'] = data['企业（机构）类型编码']\n",
    "    data['所在省份编码_分箱'] = data['所在省份编码']\n",
    "    data['国民经济行业代码_分箱'] = data['国民经济行业代码']\n",
    "    \n",
    "    file_name = 'XW_ENTINFO_TARGET'\n",
    "    TARGET = get_data(file_name, num_rows=None)\n",
    "    TARGET_train = TARGET[TARGET['is_train'] == 1]\n",
    "    BASIC_train = data[data['is_train'] == 1]\n",
    "    BASE_train_TARGET = TARGET_train.drop(['is_train', '数据日期'], axis = 1).merge(BASIC_train[['客户编号', '企业（机构）类型编码_分箱', '所在省份编码_分箱', '国民经济行业代码_分箱']], on = '客户编号', how = 'inner')\n",
    "    \n",
    "    BASE_train_TARGET = BASE_train_TARGET.drop(['客户编号'], axis = 1)\n",
    "    \n",
    "    #分箱\n",
    "    c = toad.transform.Combiner()\n",
    "    c.fit(BASE_train_TARGET, y = 'FLAG', method = 'chi', min_samples = 0.02, empty_separate = False)\n",
    "    data = c.transform(data, labels=False)\n",
    "    \n",
    "    #法定代表人\n",
    "    a = data.groupby('法定代表人').agg({'客户编号':'count', '企业（机构）类型编码': 'nunique', '国民经济行业代码': 'nunique', '所在省份编码': 'nunique'})                                    \n",
    "    a = a.reset_index()\n",
    "    a.columns = ['法定代表人', '法定代表人相关企业个数', '法人涉足企业类型', '法人涉足国民经济行业代码', '法人跨省个数']\n",
    "    data = data.merge(a, how = 'left', on = '法定代表人')\n",
    "    data = data.drop(['数据日期', 'is_train', '经营期限至', '经营期限自', '成立日期', '法定代表人', '经营状态', '企业（机构）类型编码', '所在省份编码', '国民经济行业代码'], axis = 1)\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "fb95e6ab-075f-42ef-bcf7-6864b0cd9d7b",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T02:47:03.718337Z",
     "iopub.status.busy": "2024-11-11T02:47:03.717857Z",
     "iopub.status.idle": "2024-11-11T02:47:22.730366Z",
     "msg_id": "e6e7416f-c29f-4bee-a3d5-3d4137ca13d6",
     "shell.execute_reply": "2024-11-11T02:47:22.729664Z",
     "shell.execute_reply.started": "2024-11-11T02:47:03.718304Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(59079, 40)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>客户编号</th>\n",
       "      <th>注册资本</th>\n",
       "      <th>是否长期经营</th>\n",
       "      <th>经营成立时间是否相等</th>\n",
       "      <th>注册资金过小</th>\n",
       "      <th>经营是否已过期</th>\n",
       "      <th>剩余经营天数_天</th>\n",
       "      <th>已经营天数_天</th>\n",
       "      <th>当期经营期限总天数_天</th>\n",
       "      <th>自成立经营期限总天数_天</th>\n",
       "      <th>已成立天数_天</th>\n",
       "      <th>再次经营_天</th>\n",
       "      <th>剩余经营天数_月</th>\n",
       "      <th>已经营天数_月</th>\n",
       "      <th>当期经营期限总天数_月</th>\n",
       "      <th>自成立经营期限总天数_月</th>\n",
       "      <th>已成立天数_月</th>\n",
       "      <th>再次经营_月</th>\n",
       "      <th>剩余经营天数_年</th>\n",
       "      <th>已经营天数_年</th>\n",
       "      <th>当期经营期限总天数_年</th>\n",
       "      <th>自成立经营期限总天数_年</th>\n",
       "      <th>已成立天数_年</th>\n",
       "      <th>再次经营_年</th>\n",
       "      <th>经营状态_编码</th>\n",
       "      <th>企业（机构）类型编码_频数是否前10</th>\n",
       "      <th>企业（机构）类型编码_频数是否后20</th>\n",
       "      <th>企业（机构）类型编码_是否频数最高2类</th>\n",
       "      <th>所在省份编码_频数是否前5</th>\n",
       "      <th>所在省份编码_频数是否后5</th>\n",
       "      <th>企业（机构）类型编码_是否坏率最高2类</th>\n",
       "      <th>国民经济行业代码_频数是否前5</th>\n",
       "      <th>国民经济行业代码_频数是否后230</th>\n",
       "      <th>企业（机构）类型编码_分箱</th>\n",
       "      <th>所在省份编码_分箱</th>\n",
       "      <th>国民经济行业代码_分箱</th>\n",
       "      <th>法定代表人相关企业个数</th>\n",
       "      <th>法人涉足企业类型</th>\n",
       "      <th>法人涉足国民经济行业代码</th>\n",
       "      <th>法人跨省个数</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>182d6a854532dd26a1b111e77bd501f4</td>\n",
       "      <td>690521.61</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>14574</td>\n",
       "      <td>3675</td>\n",
       "      <td>18249</td>\n",
       "      <td>18249</td>\n",
       "      <td>3675</td>\n",
       "      <td>0</td>\n",
       "      <td>480</td>\n",
       "      <td>120</td>\n",
       "      <td>600</td>\n",
       "      <td>600</td>\n",
       "      <td>120</td>\n",
       "      <td>0</td>\n",
       "      <td>40.000000</td>\n",
       "      <td>10.000000</td>\n",
       "      <td>50.000000</td>\n",
       "      <td>50.000000</td>\n",
       "      <td>10.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>f60def7aa5dc124ddae552b7bf5c7675</td>\n",
       "      <td>345266.51</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>364059</td>\n",
       "      <td>3360</td>\n",
       "      <td>367419</td>\n",
       "      <td>367419</td>\n",
       "      <td>3360</td>\n",
       "      <td>0</td>\n",
       "      <td>11969</td>\n",
       "      <td>110</td>\n",
       "      <td>12079</td>\n",
       "      <td>12079</td>\n",
       "      <td>110</td>\n",
       "      <td>0</td>\n",
       "      <td>997.416667</td>\n",
       "      <td>9.166667</td>\n",
       "      <td>1006.583333</td>\n",
       "      <td>1006.583333</td>\n",
       "      <td>9.166667</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               客户编号       注册资本  是否长期经营  经营成立时间是否相等  注册资金过小  \\\n",
       "0  182d6a854532dd26a1b111e77bd501f4  690521.61       0           1       0   \n",
       "1  f60def7aa5dc124ddae552b7bf5c7675  345266.51       1           1       0   \n",
       "\n",
       "   经营是否已过期  剩余经营天数_天  已经营天数_天  当期经营期限总天数_天  自成立经营期限总天数_天  已成立天数_天  再次经营_天  \\\n",
       "0        0     14574     3675        18249         18249     3675       0   \n",
       "1        0    364059     3360       367419        367419     3360       0   \n",
       "\n",
       "   剩余经营天数_月  已经营天数_月  当期经营期限总天数_月  自成立经营期限总天数_月  已成立天数_月  再次经营_月    剩余经营天数_年  \\\n",
       "0       480      120          600           600      120       0   40.000000   \n",
       "1     11969      110        12079         12079      110       0  997.416667   \n",
       "\n",
       "     已经营天数_年  当期经营期限总天数_年  自成立经营期限总天数_年    已成立天数_年  再次经营_年  经营状态_编码  \\\n",
       "0  10.000000    50.000000     50.000000  10.000000     0.0        1   \n",
       "1   9.166667  1006.583333   1006.583333   9.166667     0.0        1   \n",
       "\n",
       "   企业（机构）类型编码_频数是否前10  企业（机构）类型编码_频数是否后20  企业（机构）类型编码_是否频数最高2类  所在省份编码_频数是否前5  \\\n",
       "0                   1                   0                    1              0   \n",
       "1                   1                   0                    1              0   \n",
       "\n",
       "   所在省份编码_频数是否后5  企业（机构）类型编码_是否坏率最高2类  国民经济行业代码_频数是否前5  国民经济行业代码_频数是否后230  \\\n",
       "0              0                    0                0                  0   \n",
       "1              0                    0                0                  0   \n",
       "\n",
       "   企业（机构）类型编码_分箱  所在省份编码_分箱  国民经济行业代码_分箱  法定代表人相关企业个数  法人涉足企业类型  法人涉足国民经济行业代码  \\\n",
       "0              1          3            2            1         1             1   \n",
       "1              1          3            4            1         1             1   \n",
       "\n",
       "   法人跨省个数  \n",
       "0       1  \n",
       "1       1  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "basic_info = BASIC_info()\n",
    "print(basic_info.shape)\n",
    "basic_info.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "71f03b96-2668-441e-8d99-4df9a87a06e4",
   "metadata": {},
   "source": [
    "# 基本信息表文本特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "c1f1bbd0-2df8-48e3-a1e7-689326422525",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T02:47:22.732128Z",
     "iopub.status.busy": "2024-11-11T02:47:22.731768Z",
     "iopub.status.idle": "2024-11-11T02:47:22.739228Z",
     "msg_id": "7b806a58-3292-4d09-824f-2f75cf8846dd",
     "shell.execute_reply": "2024-11-11T02:47:22.738477Z",
     "shell.execute_reply.started": "2024-11-11T02:47:22.732099Z"
    }
   },
   "outputs": [],
   "source": [
    "def BASIC_text():\n",
    "    file_name = 'XW_ENTINFO_BASIC'\n",
    "    BASIC = get_data(file_name, num_rows=None)\n",
    "    BASIC.columns = ['数据日期', '客户编号', '经营期限至', '经营期限自', '经营状态', '注册资本', '成立日期','法定代表人', '企业（机构）类型编码', '所在省份编码', '国民经济行业代码', 'is_train']\n",
    "    file_name = 'XW_ENTINFO_TARGET'\n",
    "    TARGET = get_data(file_name, num_rows=None)\n",
    "    TARGET = TARGET.drop(['数据日期'], axis = 1)\n",
    "    for i in ['法定代表人', '企业（机构）类型编码', '所在省份编码', '国民经济行业代码']:\n",
    "        tmp = text_feats(BASIC, '客户编号', i, num=10)\n",
    "        TARGET = TARGET.merge(tmp, on = '客户编号', how = 'left')\n",
    "    for i in ['法定代表人', '企业（机构）类型编码', '所在省份编码', '国民经济行业代码']:\n",
    "        tmp = word2vec_feature(BASIC, '客户编号', i, ext=\"A\")\n",
    "        TARGET = TARGET.merge(tmp, on = '客户编号', how = 'left')\n",
    "    TARGET = TARGET.drop(['is_train', 'FLAG'], axis = 1)\n",
    "    return TARGET"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "6e4aa703-8e52-456c-ad69-ca06a98b463c",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T02:47:22.740454Z",
     "iopub.status.busy": "2024-11-11T02:47:22.740070Z",
     "iopub.status.idle": "2024-11-11T02:47:53.666615Z",
     "msg_id": "5a1002df-bf16-473e-916c-60ea88e66bd1",
     "shell.execute_reply": "2024-11-11T02:47:53.665922Z",
     "shell.execute_reply.started": "2024-11-11T02:47:22.740428Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(59079, 113)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>客户编号</th>\n",
       "      <th>法定代表人_tfidf_0</th>\n",
       "      <th>法定代表人_tfidf_1</th>\n",
       "      <th>法定代表人_tfidf_2</th>\n",
       "      <th>法定代表人_tfidf_3</th>\n",
       "      <th>法定代表人_tfidf_4</th>\n",
       "      <th>法定代表人_tfidf_5</th>\n",
       "      <th>法定代表人_tfidf_6</th>\n",
       "      <th>法定代表人_tfidf_7</th>\n",
       "      <th>法定代表人_tfidf_8</th>\n",
       "      <th>法定代表人_tfidf_9</th>\n",
       "      <th>法定代表人_countvec_0</th>\n",
       "      <th>法定代表人_countvec_1</th>\n",
       "      <th>法定代表人_countvec_2</th>\n",
       "      <th>法定代表人_countvec_3</th>\n",
       "      <th>法定代表人_countvec_4</th>\n",
       "      <th>法定代表人_countvec_5</th>\n",
       "      <th>法定代表人_countvec_6</th>\n",
       "      <th>法定代表人_countvec_7</th>\n",
       "      <th>法定代表人_countvec_8</th>\n",
       "      <th>法定代表人_countvec_9</th>\n",
       "      <th>企业（机构）类型编码_tfidf_0</th>\n",
       "      <th>企业（机构）类型编码_tfidf_1</th>\n",
       "      <th>企业（机构）类型编码_tfidf_2</th>\n",
       "      <th>企业（机构）类型编码_tfidf_3</th>\n",
       "      <th>企业（机构）类型编码_tfidf_4</th>\n",
       "      <th>企业（机构）类型编码_tfidf_5</th>\n",
       "      <th>企业（机构）类型编码_tfidf_6</th>\n",
       "      <th>企业（机构）类型编码_tfidf_7</th>\n",
       "      <th>企业（机构）类型编码_tfidf_8</th>\n",
       "      <th>企业（机构）类型编码_tfidf_9</th>\n",
       "      <th>企业（机构）类型编码_countvec_0</th>\n",
       "      <th>企业（机构）类型编码_countvec_1</th>\n",
       "      <th>企业（机构）类型编码_countvec_2</th>\n",
       "      <th>企业（机构）类型编码_countvec_3</th>\n",
       "      <th>企业（机构）类型编码_countvec_4</th>\n",
       "      <th>企业（机构）类型编码_countvec_5</th>\n",
       "      <th>企业（机构）类型编码_countvec_6</th>\n",
       "      <th>企业（机构）类型编码_countvec_7</th>\n",
       "      <th>企业（机构）类型编码_countvec_8</th>\n",
       "      <th>企业（机构）类型编码_countvec_9</th>\n",
       "      <th>所在省份编码_tfidf_0</th>\n",
       "      <th>所在省份编码_tfidf_1</th>\n",
       "      <th>所在省份编码_tfidf_2</th>\n",
       "      <th>所在省份编码_tfidf_3</th>\n",
       "      <th>所在省份编码_tfidf_4</th>\n",
       "      <th>所在省份编码_tfidf_5</th>\n",
       "      <th>所在省份编码_tfidf_6</th>\n",
       "      <th>所在省份编码_tfidf_7</th>\n",
       "      <th>所在省份编码_tfidf_8</th>\n",
       "      <th>所在省份编码_tfidf_9</th>\n",
       "      <th>所在省份编码_countvec_0</th>\n",
       "      <th>所在省份编码_countvec_1</th>\n",
       "      <th>所在省份编码_countvec_2</th>\n",
       "      <th>所在省份编码_countvec_3</th>\n",
       "      <th>所在省份编码_countvec_4</th>\n",
       "      <th>所在省份编码_countvec_5</th>\n",
       "      <th>所在省份编码_countvec_6</th>\n",
       "      <th>所在省份编码_countvec_7</th>\n",
       "      <th>所在省份编码_countvec_8</th>\n",
       "      <th>所在省份编码_countvec_9</th>\n",
       "      <th>国民经济行业代码_tfidf_0</th>\n",
       "      <th>国民经济行业代码_tfidf_1</th>\n",
       "      <th>国民经济行业代码_tfidf_2</th>\n",
       "      <th>国民经济行业代码_tfidf_3</th>\n",
       "      <th>国民经济行业代码_tfidf_4</th>\n",
       "      <th>国民经济行业代码_tfidf_5</th>\n",
       "      <th>国民经济行业代码_tfidf_6</th>\n",
       "      <th>国民经济行业代码_tfidf_7</th>\n",
       "      <th>国民经济行业代码_tfidf_8</th>\n",
       "      <th>国民经济行业代码_tfidf_9</th>\n",
       "      <th>国民经济行业代码_countvec_0</th>\n",
       "      <th>国民经济行业代码_countvec_1</th>\n",
       "      <th>国民经济行业代码_countvec_2</th>\n",
       "      <th>国民经济行业代码_countvec_3</th>\n",
       "      <th>国民经济行业代码_countvec_4</th>\n",
       "      <th>国民经济行业代码_countvec_5</th>\n",
       "      <th>国民经济行业代码_countvec_6</th>\n",
       "      <th>国民经济行业代码_countvec_7</th>\n",
       "      <th>国民经济行业代码_countvec_8</th>\n",
       "      <th>国民经济行业代码_countvec_9</th>\n",
       "      <th>客户编号_法定代表人_w2v_0</th>\n",
       "      <th>客户编号_法定代表人_w2v_1</th>\n",
       "      <th>客户编号_法定代表人_w2v_2</th>\n",
       "      <th>客户编号_法定代表人_w2v_3</th>\n",
       "      <th>客户编号_法定代表人_w2v_4</th>\n",
       "      <th>客户编号_法定代表人_w2v_5</th>\n",
       "      <th>客户编号_法定代表人_w2v_6</th>\n",
       "      <th>客户编号_法定代表人_w2v_7</th>\n",
       "      <th>客户编号_企业（机构）类型编码_w2v_0</th>\n",
       "      <th>客户编号_企业（机构）类型编码_w2v_1</th>\n",
       "      <th>客户编号_企业（机构）类型编码_w2v_2</th>\n",
       "      <th>客户编号_企业（机构）类型编码_w2v_3</th>\n",
       "      <th>客户编号_企业（机构）类型编码_w2v_4</th>\n",
       "      <th>客户编号_企业（机构）类型编码_w2v_5</th>\n",
       "      <th>客户编号_企业（机构）类型编码_w2v_6</th>\n",
       "      <th>客户编号_企业（机构）类型编码_w2v_7</th>\n",
       "      <th>客户编号_所在省份编码_w2v_0</th>\n",
       "      <th>客户编号_所在省份编码_w2v_1</th>\n",
       "      <th>客户编号_所在省份编码_w2v_2</th>\n",
       "      <th>客户编号_所在省份编码_w2v_3</th>\n",
       "      <th>客户编号_所在省份编码_w2v_4</th>\n",
       "      <th>客户编号_所在省份编码_w2v_5</th>\n",
       "      <th>客户编号_所在省份编码_w2v_6</th>\n",
       "      <th>客户编号_所在省份编码_w2v_7</th>\n",
       "      <th>客户编号_国民经济行业代码_w2v_0</th>\n",
       "      <th>客户编号_国民经济行业代码_w2v_1</th>\n",
       "      <th>客户编号_国民经济行业代码_w2v_2</th>\n",
       "      <th>客户编号_国民经济行业代码_w2v_3</th>\n",
       "      <th>客户编号_国民经济行业代码_w2v_4</th>\n",
       "      <th>客户编号_国民经济行业代码_w2v_5</th>\n",
       "      <th>客户编号_国民经济行业代码_w2v_6</th>\n",
       "      <th>客户编号_国民经济行业代码_w2v_7</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>158a8d99bec2a2b652a6de45a2b52ec9</td>\n",
       "      <td>-6.950766e-34</td>\n",
       "      <td>3.573632e-33</td>\n",
       "      <td>1.800550e-32</td>\n",
       "      <td>8.583294e-31</td>\n",
       "      <td>-3.078939e-31</td>\n",
       "      <td>1.478094e-30</td>\n",
       "      <td>1.217316e-29</td>\n",
       "      <td>7.504761e-31</td>\n",
       "      <td>1.203415e-29</td>\n",
       "      <td>1.054062e-31</td>\n",
       "      <td>-6.950766e-34</td>\n",
       "      <td>3.573632e-33</td>\n",
       "      <td>1.800550e-32</td>\n",
       "      <td>8.583294e-31</td>\n",
       "      <td>-3.078939e-31</td>\n",
       "      <td>1.478094e-30</td>\n",
       "      <td>1.217316e-29</td>\n",
       "      <td>7.504761e-31</td>\n",
       "      <td>1.203415e-29</td>\n",
       "      <td>1.054062e-31</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>-7.412313e-18</td>\n",
       "      <td>-2.602485e-21</td>\n",
       "      <td>-8.368757e-29</td>\n",
       "      <td>9.075137e-33</td>\n",
       "      <td>9.408616e-35</td>\n",
       "      <td>-2.943408e-33</td>\n",
       "      <td>-2.544040e-33</td>\n",
       "      <td>-1.943142e-37</td>\n",
       "      <td>9.602446e-37</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>-7.412313e-18</td>\n",
       "      <td>-2.602485e-21</td>\n",
       "      <td>-8.368757e-29</td>\n",
       "      <td>9.075137e-33</td>\n",
       "      <td>9.408616e-35</td>\n",
       "      <td>-2.943408e-33</td>\n",
       "      <td>-2.544040e-33</td>\n",
       "      <td>-1.943142e-37</td>\n",
       "      <td>9.602446e-37</td>\n",
       "      <td>-1.340637e-19</td>\n",
       "      <td>-7.838735e-20</td>\n",
       "      <td>1.522786e-20</td>\n",
       "      <td>1.880774e-19</td>\n",
       "      <td>-6.580866e-19</td>\n",
       "      <td>2.021743e-17</td>\n",
       "      <td>3.884970e-17</td>\n",
       "      <td>-2.207025e-15</td>\n",
       "      <td>1.145948e-15</td>\n",
       "      <td>2.174522e-13</td>\n",
       "      <td>-1.340637e-19</td>\n",
       "      <td>-7.838735e-20</td>\n",
       "      <td>1.522786e-20</td>\n",
       "      <td>1.880774e-19</td>\n",
       "      <td>-6.580866e-19</td>\n",
       "      <td>2.021743e-17</td>\n",
       "      <td>3.884970e-17</td>\n",
       "      <td>-2.207025e-15</td>\n",
       "      <td>1.145948e-15</td>\n",
       "      <td>2.174522e-13</td>\n",
       "      <td>3.997757e-23</td>\n",
       "      <td>-2.733747e-20</td>\n",
       "      <td>-1.840303e-20</td>\n",
       "      <td>-5.564197e-17</td>\n",
       "      <td>3.478159e-16</td>\n",
       "      <td>-9.815886e-15</td>\n",
       "      <td>-3.377545e-15</td>\n",
       "      <td>-7.009257e-16</td>\n",
       "      <td>2.493787e-14</td>\n",
       "      <td>4.229538e-13</td>\n",
       "      <td>3.997757e-23</td>\n",
       "      <td>-2.733747e-20</td>\n",
       "      <td>-1.840303e-20</td>\n",
       "      <td>-5.564197e-17</td>\n",
       "      <td>3.478159e-16</td>\n",
       "      <td>-9.815886e-15</td>\n",
       "      <td>-3.377545e-15</td>\n",
       "      <td>-7.009257e-16</td>\n",
       "      <td>2.493787e-14</td>\n",
       "      <td>4.229538e-13</td>\n",
       "      <td>0.090405</td>\n",
       "      <td>-0.036603</td>\n",
       "      <td>-0.026244</td>\n",
       "      <td>0.117404</td>\n",
       "      <td>0.090062</td>\n",
       "      <td>0.032500</td>\n",
       "      <td>-0.101215</td>\n",
       "      <td>-0.094746</td>\n",
       "      <td>-0.102687</td>\n",
       "      <td>0.068489</td>\n",
       "      <td>0.038643</td>\n",
       "      <td>-0.015280</td>\n",
       "      <td>-0.016746</td>\n",
       "      <td>0.089649</td>\n",
       "      <td>-0.103514</td>\n",
       "      <td>0.049342</td>\n",
       "      <td>0.110077</td>\n",
       "      <td>-0.015712</td>\n",
       "      <td>-0.084828</td>\n",
       "      <td>0.08317</td>\n",
       "      <td>0.032441</td>\n",
       "      <td>0.050066</td>\n",
       "      <td>-0.100693</td>\n",
       "      <td>-0.046908</td>\n",
       "      <td>-0.095276</td>\n",
       "      <td>-0.031454</td>\n",
       "      <td>0.106618</td>\n",
       "      <td>-0.018529</td>\n",
       "      <td>-0.096568</td>\n",
       "      <td>0.037983</td>\n",
       "      <td>0.115654</td>\n",
       "      <td>0.091873</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>b1d244a25a82adb7beafe33fe971402c</td>\n",
       "      <td>-3.340791e-33</td>\n",
       "      <td>-1.077927e-32</td>\n",
       "      <td>-2.584227e-32</td>\n",
       "      <td>-1.179680e-30</td>\n",
       "      <td>-9.008725e-32</td>\n",
       "      <td>-1.870315e-30</td>\n",
       "      <td>-4.261865e-30</td>\n",
       "      <td>-2.029240e-30</td>\n",
       "      <td>-1.689734e-30</td>\n",
       "      <td>3.322348e-29</td>\n",
       "      <td>-3.340791e-33</td>\n",
       "      <td>-1.077927e-32</td>\n",
       "      <td>-2.584227e-32</td>\n",
       "      <td>-1.179680e-30</td>\n",
       "      <td>-9.008725e-32</td>\n",
       "      <td>-1.870315e-30</td>\n",
       "      <td>-4.261865e-30</td>\n",
       "      <td>-2.029240e-30</td>\n",
       "      <td>-1.689734e-30</td>\n",
       "      <td>3.322348e-29</td>\n",
       "      <td>7.410764e-18</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>3.740093e-16</td>\n",
       "      <td>1.141216e-23</td>\n",
       "      <td>-1.226520e-27</td>\n",
       "      <td>-1.474954e-29</td>\n",
       "      <td>2.286179e-28</td>\n",
       "      <td>1.975235e-28</td>\n",
       "      <td>1.508660e-32</td>\n",
       "      <td>-7.455314e-32</td>\n",
       "      <td>7.410764e-18</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>3.740093e-16</td>\n",
       "      <td>1.141216e-23</td>\n",
       "      <td>-1.226520e-27</td>\n",
       "      <td>-1.474954e-29</td>\n",
       "      <td>2.286179e-28</td>\n",
       "      <td>1.975235e-28</td>\n",
       "      <td>1.508660e-32</td>\n",
       "      <td>-7.455314e-32</td>\n",
       "      <td>-1.161965e-21</td>\n",
       "      <td>2.961595e-20</td>\n",
       "      <td>-1.582304e-16</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>-9.528525e-15</td>\n",
       "      <td>2.614597e-14</td>\n",
       "      <td>9.991529e-16</td>\n",
       "      <td>-1.666323e-17</td>\n",
       "      <td>-1.370716e-17</td>\n",
       "      <td>-2.082196e-18</td>\n",
       "      <td>-1.161965e-21</td>\n",
       "      <td>2.961595e-20</td>\n",
       "      <td>-1.582304e-16</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>-9.528525e-15</td>\n",
       "      <td>2.614597e-14</td>\n",
       "      <td>9.991529e-16</td>\n",
       "      <td>-1.666323e-17</td>\n",
       "      <td>-1.370716e-17</td>\n",
       "      <td>-2.082196e-18</td>\n",
       "      <td>3.791771e-28</td>\n",
       "      <td>-2.829735e-25</td>\n",
       "      <td>2.656799e-25</td>\n",
       "      <td>-1.070390e-21</td>\n",
       "      <td>2.190595e-21</td>\n",
       "      <td>1.524752e-20</td>\n",
       "      <td>-2.091852e-19</td>\n",
       "      <td>3.367610e-19</td>\n",
       "      <td>3.195152e-19</td>\n",
       "      <td>2.705752e-18</td>\n",
       "      <td>3.791771e-28</td>\n",
       "      <td>-2.829735e-25</td>\n",
       "      <td>2.656799e-25</td>\n",
       "      <td>-1.070390e-21</td>\n",
       "      <td>2.190595e-21</td>\n",
       "      <td>1.524752e-20</td>\n",
       "      <td>-2.091852e-19</td>\n",
       "      <td>3.367610e-19</td>\n",
       "      <td>3.195152e-19</td>\n",
       "      <td>2.705752e-18</td>\n",
       "      <td>-0.011832</td>\n",
       "      <td>-0.075686</td>\n",
       "      <td>0.113298</td>\n",
       "      <td>-0.095627</td>\n",
       "      <td>-0.028781</td>\n",
       "      <td>0.091717</td>\n",
       "      <td>-0.034111</td>\n",
       "      <td>0.121524</td>\n",
       "      <td>-0.074633</td>\n",
       "      <td>-0.101456</td>\n",
       "      <td>0.006620</td>\n",
       "      <td>0.118906</td>\n",
       "      <td>0.058938</td>\n",
       "      <td>0.065285</td>\n",
       "      <td>0.054369</td>\n",
       "      <td>0.071516</td>\n",
       "      <td>0.070392</td>\n",
       "      <td>0.035966</td>\n",
       "      <td>-0.024396</td>\n",
       "      <td>0.08069</td>\n",
       "      <td>0.011357</td>\n",
       "      <td>-0.014146</td>\n",
       "      <td>-0.012385</td>\n",
       "      <td>-0.068190</td>\n",
       "      <td>-0.075757</td>\n",
       "      <td>-0.017145</td>\n",
       "      <td>0.007416</td>\n",
       "      <td>0.031503</td>\n",
       "      <td>0.006594</td>\n",
       "      <td>-0.034836</td>\n",
       "      <td>0.115443</td>\n",
       "      <td>0.003185</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               客户编号  法定代表人_tfidf_0  法定代表人_tfidf_1  \\\n",
       "0  158a8d99bec2a2b652a6de45a2b52ec9  -6.950766e-34   3.573632e-33   \n",
       "1  b1d244a25a82adb7beafe33fe971402c  -3.340791e-33  -1.077927e-32   \n",
       "\n",
       "   法定代表人_tfidf_2  法定代表人_tfidf_3  法定代表人_tfidf_4  法定代表人_tfidf_5  法定代表人_tfidf_6  \\\n",
       "0   1.800550e-32   8.583294e-31  -3.078939e-31   1.478094e-30   1.217316e-29   \n",
       "1  -2.584227e-32  -1.179680e-30  -9.008725e-32  -1.870315e-30  -4.261865e-30   \n",
       "\n",
       "   法定代表人_tfidf_7  法定代表人_tfidf_8  法定代表人_tfidf_9  法定代表人_countvec_0  \\\n",
       "0   7.504761e-31   1.203415e-29   1.054062e-31     -6.950766e-34   \n",
       "1  -2.029240e-30  -1.689734e-30   3.322348e-29     -3.340791e-33   \n",
       "\n",
       "   法定代表人_countvec_1  法定代表人_countvec_2  法定代表人_countvec_3  法定代表人_countvec_4  \\\n",
       "0      3.573632e-33      1.800550e-32      8.583294e-31     -3.078939e-31   \n",
       "1     -1.077927e-32     -2.584227e-32     -1.179680e-30     -9.008725e-32   \n",
       "\n",
       "   法定代表人_countvec_5  法定代表人_countvec_6  法定代表人_countvec_7  法定代表人_countvec_8  \\\n",
       "0      1.478094e-30      1.217316e-29      7.504761e-31      1.203415e-29   \n",
       "1     -1.870315e-30     -4.261865e-30     -2.029240e-30     -1.689734e-30   \n",
       "\n",
       "   法定代表人_countvec_9  企业（机构）类型编码_tfidf_0  企业（机构）类型编码_tfidf_1  \\\n",
       "0      1.054062e-31        1.000000e+00       -7.412313e-18   \n",
       "1      3.322348e-29        7.410764e-18        1.000000e+00   \n",
       "\n",
       "   企业（机构）类型编码_tfidf_2  企业（机构）类型编码_tfidf_3  企业（机构）类型编码_tfidf_4  \\\n",
       "0       -2.602485e-21       -8.368757e-29        9.075137e-33   \n",
       "1        3.740093e-16        1.141216e-23       -1.226520e-27   \n",
       "\n",
       "   企业（机构）类型编码_tfidf_5  企业（机构）类型编码_tfidf_6  企业（机构）类型编码_tfidf_7  \\\n",
       "0        9.408616e-35       -2.943408e-33       -2.544040e-33   \n",
       "1       -1.474954e-29        2.286179e-28        1.975235e-28   \n",
       "\n",
       "   企业（机构）类型编码_tfidf_8  企业（机构）类型编码_tfidf_9  企业（机构）类型编码_countvec_0  \\\n",
       "0       -1.943142e-37        9.602446e-37           1.000000e+00   \n",
       "1        1.508660e-32       -7.455314e-32           7.410764e-18   \n",
       "\n",
       "   企业（机构）类型编码_countvec_1  企业（机构）类型编码_countvec_2  企业（机构）类型编码_countvec_3  \\\n",
       "0          -7.412313e-18          -2.602485e-21          -8.368757e-29   \n",
       "1           1.000000e+00           3.740093e-16           1.141216e-23   \n",
       "\n",
       "   企业（机构）类型编码_countvec_4  企业（机构）类型编码_countvec_5  企业（机构）类型编码_countvec_6  \\\n",
       "0           9.075137e-33           9.408616e-35          -2.943408e-33   \n",
       "1          -1.226520e-27          -1.474954e-29           2.286179e-28   \n",
       "\n",
       "   企业（机构）类型编码_countvec_7  企业（机构）类型编码_countvec_8  企业（机构）类型编码_countvec_9  \\\n",
       "0          -2.544040e-33          -1.943142e-37           9.602446e-37   \n",
       "1           1.975235e-28           1.508660e-32          -7.455314e-32   \n",
       "\n",
       "   所在省份编码_tfidf_0  所在省份编码_tfidf_1  所在省份编码_tfidf_2  所在省份编码_tfidf_3  \\\n",
       "0   -1.340637e-19   -7.838735e-20    1.522786e-20    1.880774e-19   \n",
       "1   -1.161965e-21    2.961595e-20   -1.582304e-16    1.000000e+00   \n",
       "\n",
       "   所在省份编码_tfidf_4  所在省份编码_tfidf_5  所在省份编码_tfidf_6  所在省份编码_tfidf_7  \\\n",
       "0   -6.580866e-19    2.021743e-17    3.884970e-17   -2.207025e-15   \n",
       "1   -9.528525e-15    2.614597e-14    9.991529e-16   -1.666323e-17   \n",
       "\n",
       "   所在省份编码_tfidf_8  所在省份编码_tfidf_9  所在省份编码_countvec_0  所在省份编码_countvec_1  \\\n",
       "0    1.145948e-15    2.174522e-13      -1.340637e-19      -7.838735e-20   \n",
       "1   -1.370716e-17   -2.082196e-18      -1.161965e-21       2.961595e-20   \n",
       "\n",
       "   所在省份编码_countvec_2  所在省份编码_countvec_3  所在省份编码_countvec_4  所在省份编码_countvec_5  \\\n",
       "0       1.522786e-20       1.880774e-19      -6.580866e-19       2.021743e-17   \n",
       "1      -1.582304e-16       1.000000e+00      -9.528525e-15       2.614597e-14   \n",
       "\n",
       "   所在省份编码_countvec_6  所在省份编码_countvec_7  所在省份编码_countvec_8  所在省份编码_countvec_9  \\\n",
       "0       3.884970e-17      -2.207025e-15       1.145948e-15       2.174522e-13   \n",
       "1       9.991529e-16      -1.666323e-17      -1.370716e-17      -2.082196e-18   \n",
       "\n",
       "   国民经济行业代码_tfidf_0  国民经济行业代码_tfidf_1  国民经济行业代码_tfidf_2  国民经济行业代码_tfidf_3  \\\n",
       "0      3.997757e-23     -2.733747e-20     -1.840303e-20     -5.564197e-17   \n",
       "1      3.791771e-28     -2.829735e-25      2.656799e-25     -1.070390e-21   \n",
       "\n",
       "   国民经济行业代码_tfidf_4  国民经济行业代码_tfidf_5  国民经济行业代码_tfidf_6  国民经济行业代码_tfidf_7  \\\n",
       "0      3.478159e-16     -9.815886e-15     -3.377545e-15     -7.009257e-16   \n",
       "1      2.190595e-21      1.524752e-20     -2.091852e-19      3.367610e-19   \n",
       "\n",
       "   国民经济行业代码_tfidf_8  国民经济行业代码_tfidf_9  国民经济行业代码_countvec_0  \\\n",
       "0      2.493787e-14      4.229538e-13         3.997757e-23   \n",
       "1      3.195152e-19      2.705752e-18         3.791771e-28   \n",
       "\n",
       "   国民经济行业代码_countvec_1  国民经济行业代码_countvec_2  国民经济行业代码_countvec_3  \\\n",
       "0        -2.733747e-20        -1.840303e-20        -5.564197e-17   \n",
       "1        -2.829735e-25         2.656799e-25        -1.070390e-21   \n",
       "\n",
       "   国民经济行业代码_countvec_4  国民经济行业代码_countvec_5  国民经济行业代码_countvec_6  \\\n",
       "0         3.478159e-16        -9.815886e-15        -3.377545e-15   \n",
       "1         2.190595e-21         1.524752e-20        -2.091852e-19   \n",
       "\n",
       "   国民经济行业代码_countvec_7  国民经济行业代码_countvec_8  国民经济行业代码_countvec_9  \\\n",
       "0        -7.009257e-16         2.493787e-14         4.229538e-13   \n",
       "1         3.367610e-19         3.195152e-19         2.705752e-18   \n",
       "\n",
       "   客户编号_法定代表人_w2v_0  客户编号_法定代表人_w2v_1  客户编号_法定代表人_w2v_2  客户编号_法定代表人_w2v_3  \\\n",
       "0          0.090405         -0.036603         -0.026244          0.117404   \n",
       "1         -0.011832         -0.075686          0.113298         -0.095627   \n",
       "\n",
       "   客户编号_法定代表人_w2v_4  客户编号_法定代表人_w2v_5  客户编号_法定代表人_w2v_6  客户编号_法定代表人_w2v_7  \\\n",
       "0          0.090062          0.032500         -0.101215         -0.094746   \n",
       "1         -0.028781          0.091717         -0.034111          0.121524   \n",
       "\n",
       "   客户编号_企业（机构）类型编码_w2v_0  客户编号_企业（机构）类型编码_w2v_1  客户编号_企业（机构）类型编码_w2v_2  \\\n",
       "0              -0.102687               0.068489               0.038643   \n",
       "1              -0.074633              -0.101456               0.006620   \n",
       "\n",
       "   客户编号_企业（机构）类型编码_w2v_3  客户编号_企业（机构）类型编码_w2v_4  客户编号_企业（机构）类型编码_w2v_5  \\\n",
       "0              -0.015280              -0.016746               0.089649   \n",
       "1               0.118906               0.058938               0.065285   \n",
       "\n",
       "   客户编号_企业（机构）类型编码_w2v_6  客户编号_企业（机构）类型编码_w2v_7  客户编号_所在省份编码_w2v_0  \\\n",
       "0              -0.103514               0.049342           0.110077   \n",
       "1               0.054369               0.071516           0.070392   \n",
       "\n",
       "   客户编号_所在省份编码_w2v_1  客户编号_所在省份编码_w2v_2  客户编号_所在省份编码_w2v_3  客户编号_所在省份编码_w2v_4  \\\n",
       "0          -0.015712          -0.084828            0.08317           0.032441   \n",
       "1           0.035966          -0.024396            0.08069           0.011357   \n",
       "\n",
       "   客户编号_所在省份编码_w2v_5  客户编号_所在省份编码_w2v_6  客户编号_所在省份编码_w2v_7  \\\n",
       "0           0.050066          -0.100693          -0.046908   \n",
       "1          -0.014146          -0.012385          -0.068190   \n",
       "\n",
       "   客户编号_国民经济行业代码_w2v_0  客户编号_国民经济行业代码_w2v_1  客户编号_国民经济行业代码_w2v_2  \\\n",
       "0            -0.095276            -0.031454             0.106618   \n",
       "1            -0.075757            -0.017145             0.007416   \n",
       "\n",
       "   客户编号_国民经济行业代码_w2v_3  客户编号_国民经济行业代码_w2v_4  客户编号_国民经济行业代码_w2v_5  \\\n",
       "0            -0.018529            -0.096568             0.037983   \n",
       "1             0.031503             0.006594            -0.034836   \n",
       "\n",
       "   客户编号_国民经济行业代码_w2v_6  客户编号_国民经济行业代码_w2v_7  \n",
       "0             0.115654             0.091873  \n",
       "1             0.115443             0.003185  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the text-feature table with BASIC_text(), print its shape, and preview the first two rows.\n",
    "basic_text = BASIC_text()\n",
    "print(basic_text.shape)\n",
    "basic_text.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a1f930b4-2a3e-41a4-99df-1c71c14003be",
   "metadata": {},
   "source": [
    "# 交易明细业务特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "35655e81-bd28-489c-acd1-c0bf5f94edef",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T02:47:53.669140Z",
     "iopub.status.busy": "2024-11-11T02:47:53.668592Z",
     "iopub.status.idle": "2024-11-11T02:47:53.717472Z",
     "msg_id": "5ff4d210-098b-4057-b016-11cacd3c2586",
     "shell.execute_reply": "2024-11-11T02:47:53.716706Z",
     "shell.execute_reply.started": "2024-11-11T02:47:53.669111Z"
    }
   },
   "outputs": [],
   "source": [
    "# Hashed 记账方向代码 values. The flow features below label the first one\n",
    "# 流出 (outflow) and the second one 流入 (inflow).\n",
    "_OUTFLOW_CODE = '16459755d990723240edb88e34a13fab'\n",
    "_INFLOW_CODE = '1250d7cb654a81c7b9366dabf57fe62b'\n",
    "\n",
    "\n",
    "def _agg_by_customer(frame, spec, names):\n",
    "    \"\"\"Aggregate `frame` per 客户编号 and return a flat, merge-ready frame.\n",
    "\n",
    "    `spec` is handed to `GroupBy.agg`; `names` are the output column names in\n",
    "    the order `agg` emits them. Naming the columns before merging avoids\n",
    "    joining a two-level column index onto a single-level frame, which newer\n",
    "    pandas versions refuse.\n",
    "    \"\"\"\n",
    "    out = frame.groupby('客户编号').agg(spec)\n",
    "    out.columns = names\n",
    "    return out.reset_index()\n",
    "\n",
    "\n",
    "def _add_flow_features(features, frame, prefix):\n",
    "    \"\"\"Left-merge outflow/inflow amount and count of `frame` into `features`.\n",
    "\n",
    "    Adds `<prefix>流出金额`, `<prefix>流出笔数`, `<prefix>流入金额`,\n",
    "    `<prefix>流入笔数` and the derived `<prefix>总净流`, `<prefix>总金额`,\n",
    "    `<prefix>总笔数` columns, in that order, and returns the new frame.\n",
    "    \"\"\"\n",
    "    for code, label in ((_OUTFLOW_CODE, '流出'), (_INFLOW_CODE, '流入')):\n",
    "        part = _agg_by_customer(\n",
    "            frame[frame['记账方向代码'] == code],\n",
    "            {'折人民币交易金额': ['sum', 'count']},\n",
    "            [prefix + label + '金额', prefix + label + '笔数'],\n",
    "        )\n",
    "        features = features.merge(part, how='left', on='客户编号')\n",
    "    features[prefix + '总净流'] = features[prefix + '流入金额'] - features[prefix + '流出金额']\n",
    "    features[prefix + '总金额'] = features[prefix + '流入金额'] + features[prefix + '流出金额']\n",
    "    features[prefix + '总笔数'] = features[prefix + '流入笔数'] + features[prefix + '流出笔数']\n",
    "    return features\n",
    "\n",
    "\n",
    "def FNCL_TR_DTAL_info():\n",
    "    \"\"\"Build per-customer features from the transaction-detail table.\n",
    "\n",
    "    Reads XW_ENTINFO_FNCL_TR_DTAL and XW_ENTINFO_BASIC through `get_data` and\n",
    "    left-merges every aggregate onto TARGET['客户编号'] (both names come from\n",
    "    the notebook's global scope), so customers without matching transactions\n",
    "    get NaN in the aggregated columns.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with 客户编号 followed by 77 feature columns.\n",
    "    \"\"\"\n",
    "    data = get_data(file_name='XW_ENTINFO_FNCL_TR_DTAL')\n",
    "    basic = get_data(file_name='XW_ENTINFO_BASIC')\n",
    "    data = data.merge(basic[['数据日期', '客户编号']], how='left', on=['客户编号'])\n",
    "\n",
    "    # Dates flagged as holiday/weekend; train and test rows use different calendars.\n",
    "    low_day_train = [\n",
    "        20020428, 20020429, 20020502, 20020505, 20020506, 20020512, 20020513, 20020519,\n",
    "        20020526, 20020527, 20020528, 20020529, 20020530, 20020603, 20020609, 20020610,\n",
    "        20020616, 20020617, 20020623, 20020624, 20020630, 20020701, 20020707, 20020708,\n",
    "        20020714, 20020715, 20020719, 20020720, 20020721,\n",
    "    ]\n",
    "    low_day_A = [\n",
    "        20020529, 20020530, 20020602, 20020605, 20020606, 20020612, 20020613, 20020619,\n",
    "        20020626, 20020627, 20020628, 20020629, 20020630, 20020704, 20020710, 20020711,\n",
    "        20020717, 20020718, 20020724, 20020725, 20020731, 20020801, 20020807, 20020808,\n",
    "        20020814, 20020815, 20020819, 20020820, 20020821,\n",
    "    ]\n",
    "    # .copy() so the flag is written to independent frames instead of slices\n",
    "    # of `data` (the slice assignment raised SettingWithCopyWarning).\n",
    "    train_rows = data[data['is_train'] == 1].copy()\n",
    "    test_rows = data[data['is_train'] == 0].copy()\n",
    "    train_rows['是否节假日或周末'] = train_rows['交易日期'].isin(low_day_train).astype('int64')\n",
    "    test_rows['是否节假日或周末'] = test_rows['交易日期'].isin(low_day_A).astype('int64')\n",
    "    # Train rows first, then test rows; rows whose is_train is neither 1 nor 0 are dropped.\n",
    "    data = pd.concat([train_rows, test_rows])\n",
    "\n",
    "    data['交易日期'] = data['交易日期'].astype('str').astype('datetime64[ns]')\n",
    "    data['数据日期'] = data['数据日期'].astype('str').astype('datetime64[ns]')\n",
    "    # Vectorised day difference; replaces a row-wise apply(axis=1).\n",
    "    data['数据日期_距离_交易_天数'] = (data['数据日期'] - data['交易日期']).dt.days\n",
    "\n",
    "    data_agg = pd.DataFrame(TARGET['客户编号'])\n",
    "\n",
    "    # ---- Transaction amount relative to account balance ----\n",
    "    # The small constant keeps the division finite when the balance is 0.\n",
    "    data['交易金额占比'] = data['折人民币交易金额'] / (data['合约账户余额'] + 0.00001)\n",
    "    # NOTE(review): these ratio columns label _OUTFLOW_CODE as 转入 while the flow\n",
    "    # columns further down label the same code 流出. The names are kept as-is\n",
    "    # because downstream code consumes them -- confirm which direction is right.\n",
    "    ratio_masks = [\n",
    "        ('总交易金额占比', None),\n",
    "        ('转入金额占比', data['记账方向代码'] == _OUTFLOW_CODE),\n",
    "        ('转出金额占比', data['记账方向代码'] == _INFLOW_CODE),\n",
    "        ('本人金额占比', data['同名账户标识'] == 1),\n",
    "        ('非本人金额占比', data['同名账户标识'] == 0),\n",
    "    ]\n",
    "    for prefix, mask in ratio_masks:\n",
    "        subset = data if mask is None else data[mask]\n",
    "        part = _agg_by_customer(\n",
    "            subset,\n",
    "            {'交易金额占比': ['mean', 'max', 'min']},\n",
    "            [prefix + '_mean', prefix + '_max', prefix + '_min'],\n",
    "        )\n",
    "        data_agg = data_agg.merge(part, how='left', on='客户编号')\n",
    "\n",
    "    # Flag customers with at most 5 transactions; the raw count is not kept.\n",
    "    # Customers with no transactions have a NaN count and therefore get 0.\n",
    "    trade_count = _agg_by_customer(data, {'折人民币交易金额': 'count'}, ['总交易次数'])\n",
    "    data_agg = data_agg.merge(trade_count, how='left', on='客户编号')\n",
    "    data_agg['交易次数小于等于5'] = np.where(data_agg['总交易次数'] <= 5, 1, 0)\n",
    "    data_agg = data_agg.drop(['总交易次数'], axis=1)\n",
    "\n",
    "    # ---- Overall outflow / inflow ----\n",
    "    # The columns named 方差 hold the standard deviation ('std'), not the variance.\n",
    "    flow_spec = {'折人民币交易金额': ['sum', 'count', 'mean', 'std', 'max']}\n",
    "    outflow_all = _agg_by_customer(\n",
    "        data[data['记账方向代码'] == _OUTFLOW_CODE], flow_spec,\n",
    "        ['总流出金额', '总流出笔数', '流出平均金额', '流出金额方差', '流出金额最大值'],\n",
    "    )\n",
    "    inflow_all = _agg_by_customer(\n",
    "        data[data['记账方向代码'] == _INFLOW_CODE], flow_spec,\n",
    "        ['总流入金额', '总流入笔数', '流入平均金额', '流入金额方差', '流入金额最大值'],\n",
    "    )\n",
    "    data_agg = data_agg.merge(outflow_all, how='left', on='客户编号')\n",
    "    data_agg = data_agg.merge(inflow_all, how='left', on='客户编号')\n",
    "    data_agg['总净流'] = data_agg['总流入金额'] - data_agg['总流出金额']\n",
    "    data_agg['总金额'] = data_agg['总流入金额'] + data_agg['总流出金额']\n",
    "    data_agg['总笔数'] = data_agg['总流入笔数'] + data_agg['总流出笔数']\n",
    "\n",
    "    # ---- Most recent month (<= 30 days before 数据日期) and the earliest\n",
    "    # window (>= 60 days before 数据日期) ----\n",
    "    recent = data[data['数据日期_距离_交易_天数'] <= 30]\n",
    "    oldest = data[data['数据日期_距离_交易_天数'] >= 60]\n",
    "    data_agg = _add_flow_features(data_agg, recent, '近一月')\n",
    "    data_agg = _add_flow_features(data_agg, oldest, '倒数第三月')\n",
    "\n",
    "    # Trend: most recent month minus the earliest window.\n",
    "    data_agg['第三个月与第一个月流入金额差'] = data_agg['近一月流入金额'] - data_agg['倒数第三月流入金额']\n",
    "    data_agg['第三个月与第一个月流出金额差'] = data_agg['近一月流出金额'] - data_agg['倒数第三月流出金额']\n",
    "    data_agg['第三个月与第一个月总金额差'] = data_agg['近一月总金额'] - data_agg['倒数第三月总金额']\n",
    "    data_agg['第三个月与第一个月流出笔数差'] = data_agg['近一月流出笔数'] - data_agg['倒数第三月流出笔数']\n",
    "    data_agg['第三个月与第一个月流入笔数差'] = data_agg['近一月流入笔数'] - data_agg['倒数第三月流入笔数']\n",
    "    data_agg['第三个月与第一个月总笔数差'] = data_agg['近一月总笔数'] - data_agg['倒数第三月总笔数']\n",
    "\n",
    "    # ---- Number of distinct counterparties ----\n",
    "    counterparties = _agg_by_customer(data, {'交易对手客户编号': ['nunique']}, ['相关客户数'])\n",
    "    data_agg = data_agg.merge(counterparties, how='left', on='客户编号')\n",
    "\n",
    "    # ---- Customers whose latest transaction falls on 数据日期 ----\n",
    "    # NOTE(review): this keeps every transaction of those customers, not only\n",
    "    # the rows dated on that day -- confirm this is intended.\n",
    "    last_trade_date = data.groupby('客户编号')['交易日期'].transform('max')\n",
    "    traded_on_data_date = (data['数据日期'] - last_trade_date).dt.days == 0\n",
    "    data_agg = _add_flow_features(data_agg, data[traded_on_data_date], '最后交易日')\n",
    "\n",
    "    # ---- Holiday / weekend transactions ----\n",
    "    off_day = _agg_by_customer(\n",
    "        data[data['是否节假日或周末'] == 1],\n",
    "        {'折人民币交易金额': ['sum', 'count']},\n",
    "        ['非工作日交易金额', '非工作日交易笔数'],\n",
    "    )\n",
    "    data_agg = data_agg.merge(off_day, how='left', on='客户编号')\n",
    "\n",
    "    # ---- Amount statistics over all transactions ----\n",
    "    amount_stats = _agg_by_customer(\n",
    "        data,\n",
    "        {'折人民币交易金额': ['max', 'min', 'mean', 'std']},\n",
    "        ['企业交易绝对值最高金额', '企业交易绝对值最低金额', '企业交易绝对值_mean', '企业交易绝对值_std'],\n",
    "    )\n",
    "    data_agg = data_agg.merge(amount_stats, how='left', on='客户编号')\n",
    "\n",
    "    # ---- Account balance in both windows and the difference between them ----\n",
    "    balance_spec = {'合约账户余额': ['mean', 'max', 'std']}\n",
    "    recent_balance = _agg_by_customer(\n",
    "        recent, balance_spec,\n",
    "        ['近一月平均账户余额', '近一月最大账户余额', '近一月账户余额方差'],\n",
    "    )\n",
    "    oldest_balance = _agg_by_customer(\n",
    "        oldest, balance_spec,\n",
    "        ['倒数第三个月平均账户余额', '倒数第三个月最大账户余额', '倒数第三个月账户余额方差'],\n",
    "    )\n",
    "    data_agg = data_agg.merge(recent_balance, how='left', on='客户编号')\n",
    "    data_agg = data_agg.merge(oldest_balance, how='left', on='客户编号')\n",
    "    data_agg['第三个月与第一个月余额均值差'] = data_agg['近一月平均账户余额'] - data_agg['倒数第三个月平均账户余额']\n",
    "    data_agg['第三个月与第一个月余额最大差'] = data_agg['近一月最大账户余额'] - data_agg['倒数第三个月最大账户余额']\n",
    "\n",
    "    # ---- Distinct transaction / channel codes in both windows ----\n",
    "    code_spec = {'交易代码': ['nunique'], '渠道代码': ['nunique']}\n",
    "    recent_codes = _agg_by_customer(recent, code_spec, ['近一月交易代码个数', '近一月渠道代码个数'])\n",
    "    oldest_codes = _agg_by_customer(oldest, code_spec, ['倒数第三月交易代码个数', '倒数第三月渠道代码个数'])\n",
    "    data_agg = data_agg.merge(recent_codes, how='left', on='客户编号')\n",
    "    data_agg = data_agg.merge(oldest_codes, how='left', on='客户编号')\n",
    "    data_agg['第三个月与第一个月渠道数差'] = data_agg['近一月渠道代码个数'] - data_agg['倒数第三月渠道代码个数']\n",
    "    data_agg['第三个月与第一个月交易代码数差'] = data_agg['近一月交易代码个数'] - data_agg['倒数第三月交易代码个数']\n",
    "\n",
    "    return data_agg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "f1bbd00d-86e5-453b-b85a-3531a61ccbc4",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T02:47:53.718527Z",
     "iopub.status.busy": "2024-11-11T02:47:53.718289Z",
     "iopub.status.idle": "2024-11-11T02:58:12.227835Z",
     "msg_id": "04f8f96d-5882-4dc0-8c57-3817861c02c1",
     "shell.execute_reply": "2024-11-11T02:58:12.227086Z",
     "shell.execute_reply.started": "2024-11-11T02:47:53.718502Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(59079, 78)\n"
     ]
    }
   ],
   "source": [
    "# Build the transaction-detail feature table and print its (rows, columns) shape.\n",
    "fncl_tr_dtal_info = FNCL_TR_DTAL_info()\n",
    "print(fncl_tr_dtal_info.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "df4b01cb-c49b-40eb-b0b6-29559f75877a",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T02:58:12.229387Z",
     "iopub.status.busy": "2024-11-11T02:58:12.228969Z",
     "iopub.status.idle": "2024-11-11T02:58:12.241627Z",
     "msg_id": "df3a12d8-ae3c-4df7-a672-df9081e00745",
     "shell.execute_reply": "2024-11-11T02:58:12.240867Z",
     "shell.execute_reply.started": "2024-11-11T02:58:12.229358Z"
    }
   },
   "outputs": [],
   "source": [
    "def FNCL_TR_DTAL_info_2():\n",
    "    \"\"\"Aggregate per-customer transaction statistics over the low-frequency trading days.\n",
    "\n",
    "    Loads ``XW_ENTINFO_FNCL_TR_DTAL`` through ``get_data``, keeps the train rows\n",
    "    (``is_train == 1``) dated in ``low_day_train`` and the test rows\n",
    "    (``is_train == 0``) dated in ``low_day_A``, and aggregates them by ``客户编号``.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with a ``客户编号`` column followed by 14 aggregate columns,\n",
    "        each suffixed with ``_低频交易日``. Only customers with at least one\n",
    "        selected transaction appear.\n",
    "    \"\"\"\n",
    "    data = get_data('XW_ENTINFO_FNCL_TR_DTAL', num_rows=None)\n",
    "\n",
    "    # Hard-coded low-frequency trading dates for the train split and the A-test split.\n",
    "    low_day_train = [20020428, 20020429, 20020502, 20020505, 20020506, 20020512, 20020513, 20020519,\n",
    "                     20020526, 20020527, 20020528, 20020529, 20020530, 20020603, 20020609, 20020610,\n",
    "                     20020616, 20020617, 20020623, 20020624, 20020630, 20020701, 20020707, 20020708,\n",
    "                     20020714, 20020715, 20020719, 20020720, 20020721]\n",
    "    low_day_A = [20020529, 20020530, 20020602, 20020605, 20020606, 20020612, 20020613, 20020619,\n",
    "                 20020626, 20020627, 20020628, 20020629, 20020630, 20020704, 20020710, 20020711,\n",
    "                 20020717, 20020718, 20020724, 20020725, 20020731, 20020801, 20020807, 20020808,\n",
    "                 20020814, 20020815, 20020819, 20020820, 20020821]\n",
    "\n",
    "    # Select each split's rows with one boolean mask; every date, including the first\n",
    "    # train date, is restricted to rows of its own split.\n",
    "    in_train = (data['is_train'] == 1) & data['交易日期'].isin(low_day_train)\n",
    "    in_test = (data['is_train'] == 0) & data['交易日期'].isin(low_day_A)\n",
    "    low_freq = data[in_train | in_test]\n",
    "\n",
    "    # Output column names, listed in the same order in which base_func yields its aggregates.\n",
    "    columns = ['交易代码_count', '渠道代码_count', '合约账户余额_mean', '合约账户余额_min', '合约账户余额_max', '合约账户余额_std',\n",
    "               '折人民币交易金额_count', '折人民币交易金额_sum', '折人民币交易金额_max', '折人民币交易金额_std', '折人民币交易金额_skew',\n",
    "               '折人民币交易金额_count_notzero', '折人民币交易金额_count_zero', '交易对手客户编号_nunique']\n",
    "    base_func = {\n",
    "        '交易代码': 'count',\n",
    "        '渠道代码': 'count',\n",
    "        '合约账户余额': ['mean', 'min', 'max', 'std'],\n",
    "        '折人民币交易金额': ['count', 'sum', 'max', 'std', 'skew', count_notzero, count_zero],\n",
    "        '交易对手客户编号': 'nunique'\n",
    "    }\n",
    "\n",
    "    low_freq_agg = low_freq.groupby('客户编号').agg(base_func).reset_index()\n",
    "    # Flatten the two-level column index produced by agg into suffixed single-level names.\n",
    "    low_freq_agg.columns = ['客户编号'] + [column + '_低频交易日' for column in columns]\n",
    "    return low_freq_agg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "af7b8fdb-7b4a-42cc-806d-def2dd1d7d53",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T02:58:12.242670Z",
     "iopub.status.busy": "2024-11-11T02:58:12.242435Z",
     "iopub.status.idle": "2024-11-11T02:59:39.272362Z",
     "msg_id": "dbc2ee9b-078d-42e5-9f18-0a2a58adc9d5",
     "shell.execute_reply": "2024-11-11T02:59:39.271667Z",
     "shell.execute_reply.started": "2024-11-11T02:58:12.242646Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(55222, 15)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>客户编号</th>\n",
       "      <th>交易代码_count_低频交易日</th>\n",
       "      <th>渠道代码_count_低频交易日</th>\n",
       "      <th>合约账户余额_mean_低频交易日</th>\n",
       "      <th>合约账户余额_min_低频交易日</th>\n",
       "      <th>合约账户余额_max_低频交易日</th>\n",
       "      <th>合约账户余额_std_低频交易日</th>\n",
       "      <th>折人民币交易金额_count_低频交易日</th>\n",
       "      <th>折人民币交易金额_sum_低频交易日</th>\n",
       "      <th>折人民币交易金额_max_低频交易日</th>\n",
       "      <th>折人民币交易金额_std_低频交易日</th>\n",
       "      <th>折人民币交易金额_skew_低频交易日</th>\n",
       "      <th>折人民币交易金额_count_notzero_低频交易日</th>\n",
       "      <th>折人民币交易金额_count_zero_低频交易日</th>\n",
       "      <th>交易对手客户编号_nunique_低频交易日</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>000034607497713173a75a0d9910cb52</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>36.574</td>\n",
       "      <td>32.43</td>\n",
       "      <td>40.16</td>\n",
       "      <td>3.135248</td>\n",
       "      <td>5</td>\n",
       "      <td>93.63</td>\n",
       "      <td>21.7</td>\n",
       "      <td>2.372863</td>\n",
       "      <td>-0.727625</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               客户编号  交易代码_count_低频交易日  渠道代码_count_低频交易日  \\\n",
       "0  000034607497713173a75a0d9910cb52                 5                 5   \n",
       "\n",
       "   合约账户余额_mean_低频交易日  合约账户余额_min_低频交易日  合约账户余额_max_低频交易日  合约账户余额_std_低频交易日  \\\n",
       "0             36.574             32.43             40.16          3.135248   \n",
       "\n",
       "   折人民币交易金额_count_低频交易日  折人民币交易金额_sum_低频交易日  折人民币交易金额_max_低频交易日  \\\n",
       "0                     5               93.63                21.7   \n",
       "\n",
       "   折人民币交易金额_std_低频交易日  折人民币交易金额_skew_低频交易日  折人民币交易金额_count_notzero_低频交易日  \\\n",
       "0            2.372863            -0.727625                             5   \n",
       "\n",
       "   折人民币交易金额_count_zero_低频交易日  交易对手客户编号_nunique_低频交易日  \n",
       "0                          0                       0  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Run FNCL_TR_DTAL_info_2, print the shape of its result and preview the first row.\n",
    "fncl_tr_dtal_info_2 = FNCL_TR_DTAL_info_2()\n",
    "print(fncl_tr_dtal_info_2.shape)\n",
    "fncl_tr_dtal_info_2.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "220d2caa-7d8c-42dd-b13f-0fab53acac8a",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T02:59:39.273701Z",
     "iopub.status.busy": "2024-11-11T02:59:39.273369Z",
     "iopub.status.idle": "2024-11-11T02:59:39.433995Z",
     "msg_id": "8c7a9de6-09cc-43d2-bd63-4bd3ea70f243",
     "shell.execute_reply": "2024-11-11T02:59:39.433331Z",
     "shell.execute_reply.started": "2024-11-11T02:59:39.273672Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>客户编号</th>\n",
       "      <th>总交易金额占比_mean</th>\n",
       "      <th>总交易金额占比_max</th>\n",
       "      <th>总交易金额占比_min</th>\n",
       "      <th>转入金额占比_mean</th>\n",
       "      <th>转入金额占比_max</th>\n",
       "      <th>转入金额占比_min</th>\n",
       "      <th>转出金额占比_mean</th>\n",
       "      <th>转出金额占比_max</th>\n",
       "      <th>转出金额占比_min</th>\n",
       "      <th>本人金额占比_mean</th>\n",
       "      <th>本人金额占比_max</th>\n",
       "      <th>本人金额占比_min</th>\n",
       "      <th>非本人金额占比_mean</th>\n",
       "      <th>非本人金额占比_max</th>\n",
       "      <th>非本人金额占比_min</th>\n",
       "      <th>交易次数小于等于5</th>\n",
       "      <th>总流出金额</th>\n",
       "      <th>总流出笔数</th>\n",
       "      <th>流出平均金额</th>\n",
       "      <th>流出金额方差</th>\n",
       "      <th>流出金额最大值</th>\n",
       "      <th>总流入金额</th>\n",
       "      <th>总流入笔数</th>\n",
       "      <th>流入平均金额</th>\n",
       "      <th>流入金额方差</th>\n",
       "      <th>流入金额最大值</th>\n",
       "      <th>总净流</th>\n",
       "      <th>总金额</th>\n",
       "      <th>总笔数</th>\n",
       "      <th>近一月流出金额</th>\n",
       "      <th>近一月流出笔数</th>\n",
       "      <th>近一月流入金额</th>\n",
       "      <th>近一月流入笔数</th>\n",
       "      <th>近一月总净流</th>\n",
       "      <th>近一月总金额</th>\n",
       "      <th>近一月总笔数</th>\n",
       "      <th>倒数第三月流出金额</th>\n",
       "      <th>倒数第三月流出笔数</th>\n",
       "      <th>倒数第三月流入金额</th>\n",
       "      <th>倒数第三月流入笔数</th>\n",
       "      <th>倒数第三月总净流</th>\n",
       "      <th>倒数第三月总金额</th>\n",
       "      <th>倒数第三月总笔数</th>\n",
       "      <th>第三个月与第一个月流入金额差</th>\n",
       "      <th>第三个月与第一个月流出金额差</th>\n",
       "      <th>第三个月与第一个月总金额差</th>\n",
       "      <th>第三个月与第一个月流出笔数差</th>\n",
       "      <th>第三个月与第一个月流入笔数差</th>\n",
       "      <th>第三个月与第一个月总笔数差</th>\n",
       "      <th>相关客户数</th>\n",
       "      <th>最后交易日流出金额</th>\n",
       "      <th>最后交易日流出笔数</th>\n",
       "      <th>最后交易日流入金额</th>\n",
       "      <th>最后交易日流入笔数</th>\n",
       "      <th>最后交易日总净流</th>\n",
       "      <th>最后交易日总金额</th>\n",
       "      <th>最后交易日总笔数</th>\n",
       "      <th>非工作日交易金额</th>\n",
       "      <th>非工作日交易笔数</th>\n",
       "      <th>企业交易绝对值最高金额</th>\n",
       "      <th>企业交易绝对值最低金额</th>\n",
       "      <th>企业交易绝对值_mean</th>\n",
       "      <th>企业交易绝对值_std</th>\n",
       "      <th>近一月平均账户余额</th>\n",
       "      <th>近一月最大账户余额</th>\n",
       "      <th>近一月账户余额方差</th>\n",
       "      <th>倒数第三个月平均账户余额</th>\n",
       "      <th>倒数第三个月最大账户余额</th>\n",
       "      <th>倒数第三个月账户余额方差</th>\n",
       "      <th>第三个月与第一个月余额均值差</th>\n",
       "      <th>第三个月与第一个月余额最大差</th>\n",
       "      <th>近一月交易代码个数</th>\n",
       "      <th>近一月渠道代码个数</th>\n",
       "      <th>倒数第三月交易代码个数</th>\n",
       "      <th>倒数第三月渠道代码个数</th>\n",
       "      <th>第三个月与第一个月渠道数差</th>\n",
       "      <th>第三个月与第一个月交易代码数差</th>\n",
       "      <th>交易代码_count_低频交易日</th>\n",
       "      <th>渠道代码_count_低频交易日</th>\n",
       "      <th>合约账户余额_mean_低频交易日</th>\n",
       "      <th>合约账户余额_min_低频交易日</th>\n",
       "      <th>合约账户余额_max_低频交易日</th>\n",
       "      <th>合约账户余额_std_低频交易日</th>\n",
       "      <th>折人民币交易金额_count_低频交易日</th>\n",
       "      <th>折人民币交易金额_sum_低频交易日</th>\n",
       "      <th>折人民币交易金额_max_低频交易日</th>\n",
       "      <th>折人民币交易金额_std_低频交易日</th>\n",
       "      <th>折人民币交易金额_skew_低频交易日</th>\n",
       "      <th>折人民币交易金额_count_notzero_低频交易日</th>\n",
       "      <th>折人民币交易金额_count_zero_低频交易日</th>\n",
       "      <th>交易对手客户编号_nunique_低频交易日</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>158a8d99bec2a2b652a6de45a2b52ec9</td>\n",
       "      <td>141147.721138</td>\n",
       "      <td>1740000.0</td>\n",
       "      <td>0.014808</td>\n",
       "      <td>161000.670796</td>\n",
       "      <td>1183000.0</td>\n",
       "      <td>0.014808</td>\n",
       "      <td>116000.651573</td>\n",
       "      <td>1740000.0</td>\n",
       "      <td>0.147927</td>\n",
       "      <td>1.629809</td>\n",
       "      <td>2.799176</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.985128</td>\n",
       "      <td>0.985128</td>\n",
       "      <td>0.985128</td>\n",
       "      <td>0</td>\n",
       "      <td>583.82</td>\n",
       "      <td>19.0</td>\n",
       "      <td>30.727368</td>\n",
       "      <td>38.514283</td>\n",
       "      <td>106.75</td>\n",
       "      <td>418.88</td>\n",
       "      <td>15.0</td>\n",
       "      <td>27.925333</td>\n",
       "      <td>36.134326</td>\n",
       "      <td>106.75</td>\n",
       "      <td>-164.94</td>\n",
       "      <td>1002.7</td>\n",
       "      <td>34.0</td>\n",
       "      <td>130.15</td>\n",
       "      <td>5.0</td>\n",
       "      <td>78.81</td>\n",
       "      <td>2.0</td>\n",
       "      <td>-51.34</td>\n",
       "      <td>208.96</td>\n",
       "      <td>7.0</td>\n",
       "      <td>388.28</td>\n",
       "      <td>9.0</td>\n",
       "      <td>213.5</td>\n",
       "      <td>2.0</td>\n",
       "      <td>-174.78</td>\n",
       "      <td>601.78</td>\n",
       "      <td>11.0</td>\n",
       "      <td>-134.69</td>\n",
       "      <td>-258.13</td>\n",
       "      <td>-392.82</td>\n",
       "      <td>-4.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-4.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>69.6</td>\n",
       "      <td>4.0</td>\n",
       "      <td>106.75</td>\n",
       "      <td>1.24</td>\n",
       "      <td>29.491176</td>\n",
       "      <td>36.946221</td>\n",
       "      <td>30.41</td>\n",
       "      <td>75.98</td>\n",
       "      <td>21.636022</td>\n",
       "      <td>69.550909</td>\n",
       "      <td>106.75</td>\n",
       "      <td>26.308696</td>\n",
       "      <td>-39.140909</td>\n",
       "      <td>-30.77</td>\n",
       "      <td>6.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>18.545</td>\n",
       "      <td>0.0</td>\n",
       "      <td>39.38</td>\n",
       "      <td>16.131091</td>\n",
       "      <td>4.0</td>\n",
       "      <td>69.6</td>\n",
       "      <td>17.4</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               客户编号   总交易金额占比_mean  总交易金额占比_max  总交易金额占比_min  \\\n",
       "0  158a8d99bec2a2b652a6de45a2b52ec9  141147.721138    1740000.0     0.014808   \n",
       "\n",
       "     转入金额占比_mean  转入金额占比_max  转入金额占比_min    转出金额占比_mean  转出金额占比_max  \\\n",
       "0  161000.670796   1183000.0    0.014808  116000.651573   1740000.0   \n",
       "\n",
       "   转出金额占比_min  本人金额占比_mean  本人金额占比_max  本人金额占比_min  非本人金额占比_mean  非本人金额占比_max  \\\n",
       "0    0.147927     1.629809    2.799176         1.0      0.985128     0.985128   \n",
       "\n",
       "   非本人金额占比_min  交易次数小于等于5   总流出金额  总流出笔数     流出平均金额     流出金额方差  流出金额最大值  \\\n",
       "0     0.985128          0  583.82   19.0  30.727368  38.514283   106.75   \n",
       "\n",
       "    总流入金额  总流入笔数     流入平均金额     流入金额方差  流入金额最大值     总净流     总金额   总笔数  \\\n",
       "0  418.88   15.0  27.925333  36.134326   106.75 -164.94  1002.7  34.0   \n",
       "\n",
       "   近一月流出金额  近一月流出笔数  近一月流入金额  近一月流入笔数  近一月总净流  近一月总金额  近一月总笔数  倒数第三月流出金额  \\\n",
       "0   130.15      5.0    78.81      2.0  -51.34  208.96     7.0     388.28   \n",
       "\n",
       "   倒数第三月流出笔数  倒数第三月流入金额  倒数第三月流入笔数  倒数第三月总净流  倒数第三月总金额  倒数第三月总笔数  \\\n",
       "0        9.0      213.5        2.0   -174.78    601.78      11.0   \n",
       "\n",
       "   第三个月与第一个月流入金额差  第三个月与第一个月流出金额差  第三个月与第一个月总金额差  第三个月与第一个月流出笔数差  \\\n",
       "0         -134.69         -258.13        -392.82            -4.0   \n",
       "\n",
       "   第三个月与第一个月流入笔数差  第三个月与第一个月总笔数差  相关客户数  最后交易日流出金额  最后交易日流出笔数  最后交易日流入金额  \\\n",
       "0             0.0           -4.0    2.0        NaN        NaN        NaN   \n",
       "\n",
       "   最后交易日流入笔数  最后交易日总净流  最后交易日总金额  最后交易日总笔数  非工作日交易金额  非工作日交易笔数  企业交易绝对值最高金额  \\\n",
       "0        NaN       NaN       NaN       NaN      69.6       4.0       106.75   \n",
       "\n",
       "   企业交易绝对值最低金额  企业交易绝对值_mean  企业交易绝对值_std  近一月平均账户余额  近一月最大账户余额  近一月账户余额方差  \\\n",
       "0         1.24     29.491176    36.946221      30.41      75.98  21.636022   \n",
       "\n",
       "   倒数第三个月平均账户余额  倒数第三个月最大账户余额  倒数第三个月账户余额方差  第三个月与第一个月余额均值差  第三个月与第一个月余额最大差  \\\n",
       "0     69.550909        106.75     26.308696      -39.140909          -30.77   \n",
       "\n",
       "   近一月交易代码个数  近一月渠道代码个数  倒数第三月交易代码个数  倒数第三月渠道代码个数  第三个月与第一个月渠道数差  \\\n",
       "0        6.0        4.0          5.0          4.0            0.0   \n",
       "\n",
       "   第三个月与第一个月交易代码数差  交易代码_count_低频交易日  渠道代码_count_低频交易日  合约账户余额_mean_低频交易日  \\\n",
       "0              1.0               4.0               4.0             18.545   \n",
       "\n",
       "   合约账户余额_min_低频交易日  合约账户余额_max_低频交易日  合约账户余额_std_低频交易日  折人民币交易金额_count_低频交易日  \\\n",
       "0               0.0             39.38         16.131091                   4.0   \n",
       "\n",
       "   折人民币交易金额_sum_低频交易日  折人民币交易金额_max_低频交易日  折人民币交易金额_std_低频交易日  \\\n",
       "0                69.6                17.4                 0.0   \n",
       "\n",
       "   折人民币交易金额_skew_低频交易日  折人民币交易金额_count_notzero_低频交易日  \\\n",
       "0                  0.0                           4.0   \n",
       "\n",
       "   折人民币交易金额_count_zero_低频交易日  交易对手客户编号_nunique_低频交易日  \n",
       "0                        0.0                     1.0  "
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Left-join the low-frequency-day aggregates onto the main transaction feature table.\n",
    "# validate='many_to_one' makes pandas raise MergeError if 客户编号 repeats in the right table,\n",
    "# instead of silently duplicating customer rows.\n",
    "fncl_tr_dtal_info_all = fncl_tr_dtal_info.merge(\n",
    "    fncl_tr_dtal_info_2, on='客户编号', how='left', validate='many_to_one'\n",
    ")\n",
    "fncl_tr_dtal_info_all.head(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5ec39658-7cc4-4907-a1a9-a4bc460fcbfc",
   "metadata": {},
   "source": [
    "# 交易明细文本特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "1cf49376-6533-4b60-832b-6ff1aff9d630",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T02:59:39.435312Z",
     "iopub.status.busy": "2024-11-11T02:59:39.434993Z",
     "iopub.status.idle": "2024-11-11T02:59:39.441607Z",
     "msg_id": "273a4372-b204-4af9-bc6f-f7f7bb44ecd0",
     "shell.execute_reply": "2024-11-11T02:59:39.440887Z",
     "shell.execute_reply.started": "2024-11-11T02:59:39.435283Z"
    }
   },
   "outputs": [],
   "source": [
    "def FNCL_TR_DTAL_text():\n",
    "    \"\"\"Attach text-derived transaction features to every row of the target table.\n",
    "\n",
    "    For each of four transaction columns, the output of ``text_feats`` (``num=10``)\n",
    "    and then of ``word2vec_feature`` (``ext=\"A\"``) is left-joined onto the target\n",
    "    table by ``客户编号``.\n",
    "\n",
    "    Returns:\n",
    "        The target table without ``数据日期``, ``is_train`` and ``FLAG``, extended\n",
    "        with the generated feature columns.\n",
    "    \"\"\"\n",
    "    detail = get_data('XW_ENTINFO_FNCL_TR_DTAL', num_rows=None)\n",
    "    features = get_data('XW_ENTINFO_TARGET', num_rows=None).drop(['数据日期'], axis=1)\n",
    "\n",
    "    text_columns = ['交易代码', '渠道代码', '摘要信息', '交易对手客户编号']\n",
    "\n",
    "    # Every text_feats block is merged before any word2vec block, which fixes the column order.\n",
    "    for column in text_columns:\n",
    "        column_feats = text_feats(detail, '客户编号', column, num=10)\n",
    "        features = features.merge(column_feats, on='客户编号', how='left')\n",
    "\n",
    "    for column in text_columns:\n",
    "        column_feats = word2vec_feature(detail, '客户编号', column, ext=\"A\")\n",
    "        features = features.merge(column_feats, on='客户编号', how='left')\n",
    "\n",
    "    return features.drop(['is_train', 'FLAG'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "efb3a32c-e381-41e9-9302-ce8d5143bfab",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T02:59:39.444572Z",
     "iopub.status.busy": "2024-11-11T02:59:39.444166Z",
     "iopub.status.idle": "2024-11-11T03:19:35.349622Z",
     "msg_id": "088e44d2-fac1-4792-ae67-e8d997fe83fe",
     "shell.execute_reply": "2024-11-11T03:19:35.348925Z",
     "shell.execute_reply.started": "2024-11-11T02:59:39.444546Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(59079, 113)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>客户编号</th>\n",
       "      <th>交易代码_tfidf_0</th>\n",
       "      <th>交易代码_tfidf_1</th>\n",
       "      <th>交易代码_tfidf_2</th>\n",
       "      <th>交易代码_tfidf_3</th>\n",
       "      <th>交易代码_tfidf_4</th>\n",
       "      <th>交易代码_tfidf_5</th>\n",
       "      <th>交易代码_tfidf_6</th>\n",
       "      <th>交易代码_tfidf_7</th>\n",
       "      <th>交易代码_tfidf_8</th>\n",
       "      <th>交易代码_tfidf_9</th>\n",
       "      <th>交易代码_countvec_0</th>\n",
       "      <th>交易代码_countvec_1</th>\n",
       "      <th>交易代码_countvec_2</th>\n",
       "      <th>交易代码_countvec_3</th>\n",
       "      <th>交易代码_countvec_4</th>\n",
       "      <th>交易代码_countvec_5</th>\n",
       "      <th>交易代码_countvec_6</th>\n",
       "      <th>交易代码_countvec_7</th>\n",
       "      <th>交易代码_countvec_8</th>\n",
       "      <th>交易代码_countvec_9</th>\n",
       "      <th>渠道代码_tfidf_0</th>\n",
       "      <th>渠道代码_tfidf_1</th>\n",
       "      <th>渠道代码_tfidf_2</th>\n",
       "      <th>渠道代码_tfidf_3</th>\n",
       "      <th>渠道代码_tfidf_4</th>\n",
       "      <th>渠道代码_tfidf_5</th>\n",
       "      <th>渠道代码_tfidf_6</th>\n",
       "      <th>渠道代码_tfidf_7</th>\n",
       "      <th>渠道代码_tfidf_8</th>\n",
       "      <th>渠道代码_tfidf_9</th>\n",
       "      <th>渠道代码_countvec_0</th>\n",
       "      <th>渠道代码_countvec_1</th>\n",
       "      <th>渠道代码_countvec_2</th>\n",
       "      <th>渠道代码_countvec_3</th>\n",
       "      <th>渠道代码_countvec_4</th>\n",
       "      <th>渠道代码_countvec_5</th>\n",
       "      <th>渠道代码_countvec_6</th>\n",
       "      <th>渠道代码_countvec_7</th>\n",
       "      <th>渠道代码_countvec_8</th>\n",
       "      <th>渠道代码_countvec_9</th>\n",
       "      <th>摘要信息_tfidf_0</th>\n",
       "      <th>摘要信息_tfidf_1</th>\n",
       "      <th>摘要信息_tfidf_2</th>\n",
       "      <th>摘要信息_tfidf_3</th>\n",
       "      <th>摘要信息_tfidf_4</th>\n",
       "      <th>摘要信息_tfidf_5</th>\n",
       "      <th>摘要信息_tfidf_6</th>\n",
       "      <th>摘要信息_tfidf_7</th>\n",
       "      <th>摘要信息_tfidf_8</th>\n",
       "      <th>摘要信息_tfidf_9</th>\n",
       "      <th>摘要信息_countvec_0</th>\n",
       "      <th>摘要信息_countvec_1</th>\n",
       "      <th>摘要信息_countvec_2</th>\n",
       "      <th>摘要信息_countvec_3</th>\n",
       "      <th>摘要信息_countvec_4</th>\n",
       "      <th>摘要信息_countvec_5</th>\n",
       "      <th>摘要信息_countvec_6</th>\n",
       "      <th>摘要信息_countvec_7</th>\n",
       "      <th>摘要信息_countvec_8</th>\n",
       "      <th>摘要信息_countvec_9</th>\n",
       "      <th>交易对手客户编号_tfidf_0</th>\n",
       "      <th>交易对手客户编号_tfidf_1</th>\n",
       "      <th>交易对手客户编号_tfidf_2</th>\n",
       "      <th>交易对手客户编号_tfidf_3</th>\n",
       "      <th>交易对手客户编号_tfidf_4</th>\n",
       "      <th>交易对手客户编号_tfidf_5</th>\n",
       "      <th>交易对手客户编号_tfidf_6</th>\n",
       "      <th>交易对手客户编号_tfidf_7</th>\n",
       "      <th>交易对手客户编号_tfidf_8</th>\n",
       "      <th>交易对手客户编号_tfidf_9</th>\n",
       "      <th>交易对手客户编号_countvec_0</th>\n",
       "      <th>交易对手客户编号_countvec_1</th>\n",
       "      <th>交易对手客户编号_countvec_2</th>\n",
       "      <th>交易对手客户编号_countvec_3</th>\n",
       "      <th>交易对手客户编号_countvec_4</th>\n",
       "      <th>交易对手客户编号_countvec_5</th>\n",
       "      <th>交易对手客户编号_countvec_6</th>\n",
       "      <th>交易对手客户编号_countvec_7</th>\n",
       "      <th>交易对手客户编号_countvec_8</th>\n",
       "      <th>交易对手客户编号_countvec_9</th>\n",
       "      <th>客户编号_交易代码_w2v_0</th>\n",
       "      <th>客户编号_交易代码_w2v_1</th>\n",
       "      <th>客户编号_交易代码_w2v_2</th>\n",
       "      <th>客户编号_交易代码_w2v_3</th>\n",
       "      <th>客户编号_交易代码_w2v_4</th>\n",
       "      <th>客户编号_交易代码_w2v_5</th>\n",
       "      <th>客户编号_交易代码_w2v_6</th>\n",
       "      <th>客户编号_交易代码_w2v_7</th>\n",
       "      <th>客户编号_渠道代码_w2v_0</th>\n",
       "      <th>客户编号_渠道代码_w2v_1</th>\n",
       "      <th>客户编号_渠道代码_w2v_2</th>\n",
       "      <th>客户编号_渠道代码_w2v_3</th>\n",
       "      <th>客户编号_渠道代码_w2v_4</th>\n",
       "      <th>客户编号_渠道代码_w2v_5</th>\n",
       "      <th>客户编号_渠道代码_w2v_6</th>\n",
       "      <th>客户编号_渠道代码_w2v_7</th>\n",
       "      <th>客户编号_摘要信息_w2v_0</th>\n",
       "      <th>客户编号_摘要信息_w2v_1</th>\n",
       "      <th>客户编号_摘要信息_w2v_2</th>\n",
       "      <th>客户编号_摘要信息_w2v_3</th>\n",
       "      <th>客户编号_摘要信息_w2v_4</th>\n",
       "      <th>客户编号_摘要信息_w2v_5</th>\n",
       "      <th>客户编号_摘要信息_w2v_6</th>\n",
       "      <th>客户编号_摘要信息_w2v_7</th>\n",
       "      <th>客户编号_交易对手客户编号_w2v_0</th>\n",
       "      <th>客户编号_交易对手客户编号_w2v_1</th>\n",
       "      <th>客户编号_交易对手客户编号_w2v_2</th>\n",
       "      <th>客户编号_交易对手客户编号_w2v_3</th>\n",
       "      <th>客户编号_交易对手客户编号_w2v_4</th>\n",
       "      <th>客户编号_交易对手客户编号_w2v_5</th>\n",
       "      <th>客户编号_交易对手客户编号_w2v_6</th>\n",
       "      <th>客户编号_交易对手客户编号_w2v_7</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>158a8d99bec2a2b652a6de45a2b52ec9</td>\n",
       "      <td>0.81109</td>\n",
       "      <td>-0.159784</td>\n",
       "      <td>-0.092681</td>\n",
       "      <td>-0.093322</td>\n",
       "      <td>0.218147</td>\n",
       "      <td>-0.007889</td>\n",
       "      <td>-0.054189</td>\n",
       "      <td>-0.011748</td>\n",
       "      <td>-0.074365</td>\n",
       "      <td>0.063339</td>\n",
       "      <td>2.019542</td>\n",
       "      <td>-0.889053</td>\n",
       "      <td>4.912235</td>\n",
       "      <td>-1.589329</td>\n",
       "      <td>-0.261172</td>\n",
       "      <td>3.24468</td>\n",
       "      <td>-0.144627</td>\n",
       "      <td>-4.097784</td>\n",
       "      <td>-11.070854</td>\n",
       "      <td>-1.099097</td>\n",
       "      <td>0.890489</td>\n",
       "      <td>-0.107143</td>\n",
       "      <td>0.329053</td>\n",
       "      <td>-0.063001</td>\n",
       "      <td>-0.021929</td>\n",
       "      <td>0.00111</td>\n",
       "      <td>0.017907</td>\n",
       "      <td>-0.081175</td>\n",
       "      <td>-0.034756</td>\n",
       "      <td>-0.000329</td>\n",
       "      <td>7.85908</td>\n",
       "      <td>0.145436</td>\n",
       "      <td>19.245443</td>\n",
       "      <td>1.466869</td>\n",
       "      <td>-3.416739</td>\n",
       "      <td>-1.930486</td>\n",
       "      <td>-0.374106</td>\n",
       "      <td>-0.109338</td>\n",
       "      <td>-0.079323</td>\n",
       "      <td>-0.612251</td>\n",
       "      <td>0.54559</td>\n",
       "      <td>-0.020252</td>\n",
       "      <td>-0.058134</td>\n",
       "      <td>-0.024533</td>\n",
       "      <td>-0.004371</td>\n",
       "      <td>0.007319</td>\n",
       "      <td>-0.009165</td>\n",
       "      <td>-0.009935</td>\n",
       "      <td>-0.010712</td>\n",
       "      <td>-0.013387</td>\n",
       "      <td>23.004029</td>\n",
       "      <td>0.920446</td>\n",
       "      <td>-0.020211</td>\n",
       "      <td>-0.002396</td>\n",
       "      <td>-0.00033</td>\n",
       "      <td>-0.00434</td>\n",
       "      <td>-0.001517</td>\n",
       "      <td>-0.000785</td>\n",
       "      <td>0.004403</td>\n",
       "      <td>-0.006982</td>\n",
       "      <td>0.884758</td>\n",
       "      <td>-0.209479</td>\n",
       "      <td>-0.005232</td>\n",
       "      <td>-0.0003</td>\n",
       "      <td>-0.000148</td>\n",
       "      <td>-0.000956</td>\n",
       "      <td>-0.000188</td>\n",
       "      <td>-0.000054</td>\n",
       "      <td>0.000012</td>\n",
       "      <td>-0.001242</td>\n",
       "      <td>0.229744</td>\n",
       "      <td>13.957486</td>\n",
       "      <td>20.191288</td>\n",
       "      <td>-1.092272</td>\n",
       "      <td>0.229367</td>\n",
       "      <td>-0.040072</td>\n",
       "      <td>-0.164456</td>\n",
       "      <td>-0.010378</td>\n",
       "      <td>0.011442</td>\n",
       "      <td>-0.001079</td>\n",
       "      <td>-0.819428</td>\n",
       "      <td>-0.073411</td>\n",
       "      <td>-0.349541</td>\n",
       "      <td>-0.372124</td>\n",
       "      <td>-0.384011</td>\n",
       "      <td>-0.285305</td>\n",
       "      <td>0.314123</td>\n",
       "      <td>-0.695355</td>\n",
       "      <td>-0.830398</td>\n",
       "      <td>-0.187385</td>\n",
       "      <td>0.527071</td>\n",
       "      <td>0.459128</td>\n",
       "      <td>0.457008</td>\n",
       "      <td>0.509866</td>\n",
       "      <td>0.061953</td>\n",
       "      <td>-0.430103</td>\n",
       "      <td>-0.327669</td>\n",
       "      <td>-0.084095</td>\n",
       "      <td>1.435355</td>\n",
       "      <td>1.051913</td>\n",
       "      <td>0.641961</td>\n",
       "      <td>0.656063</td>\n",
       "      <td>-4.134728</td>\n",
       "      <td>0.782936</td>\n",
       "      <td>0.261499</td>\n",
       "      <td>-0.983913</td>\n",
       "      <td>2.340259</td>\n",
       "      <td>0.130165</td>\n",
       "      <td>0.516862</td>\n",
       "      <td>0.147036</td>\n",
       "      <td>-0.565741</td>\n",
       "      <td>1.807155</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               客户编号  交易代码_tfidf_0  交易代码_tfidf_1  交易代码_tfidf_2  \\\n",
       "0  158a8d99bec2a2b652a6de45a2b52ec9       0.81109     -0.159784     -0.092681   \n",
       "\n",
       "   交易代码_tfidf_3  交易代码_tfidf_4  交易代码_tfidf_5  交易代码_tfidf_6  交易代码_tfidf_7  \\\n",
       "0     -0.093322      0.218147     -0.007889     -0.054189     -0.011748   \n",
       "\n",
       "   交易代码_tfidf_8  交易代码_tfidf_9  交易代码_countvec_0  交易代码_countvec_1  \\\n",
       "0     -0.074365      0.063339         2.019542        -0.889053   \n",
       "\n",
       "   交易代码_countvec_2  交易代码_countvec_3  交易代码_countvec_4  交易代码_countvec_5  \\\n",
       "0         4.912235        -1.589329        -0.261172          3.24468   \n",
       "\n",
       "   交易代码_countvec_6  交易代码_countvec_7  交易代码_countvec_8  交易代码_countvec_9  \\\n",
       "0        -0.144627        -4.097784       -11.070854        -1.099097   \n",
       "\n",
       "   渠道代码_tfidf_0  渠道代码_tfidf_1  渠道代码_tfidf_2  渠道代码_tfidf_3  渠道代码_tfidf_4  \\\n",
       "0      0.890489     -0.107143      0.329053     -0.063001     -0.021929   \n",
       "\n",
       "   渠道代码_tfidf_5  渠道代码_tfidf_6  渠道代码_tfidf_7  渠道代码_tfidf_8  渠道代码_tfidf_9  \\\n",
       "0       0.00111      0.017907     -0.081175     -0.034756     -0.000329   \n",
       "\n",
       "   渠道代码_countvec_0  渠道代码_countvec_1  渠道代码_countvec_2  渠道代码_countvec_3  \\\n",
       "0          7.85908         0.145436        19.245443         1.466869   \n",
       "\n",
       "   渠道代码_countvec_4  渠道代码_countvec_5  渠道代码_countvec_6  渠道代码_countvec_7  \\\n",
       "0        -3.416739        -1.930486        -0.374106        -0.109338   \n",
       "\n",
       "   渠道代码_countvec_8  渠道代码_countvec_9  摘要信息_tfidf_0  摘要信息_tfidf_1  摘要信息_tfidf_2  \\\n",
       "0        -0.079323        -0.612251       0.54559     -0.020252     -0.058134   \n",
       "\n",
       "   摘要信息_tfidf_3  摘要信息_tfidf_4  摘要信息_tfidf_5  摘要信息_tfidf_6  摘要信息_tfidf_7  \\\n",
       "0     -0.024533     -0.004371      0.007319     -0.009165     -0.009935   \n",
       "\n",
       "   摘要信息_tfidf_8  摘要信息_tfidf_9  摘要信息_countvec_0  摘要信息_countvec_1  \\\n",
       "0     -0.010712     -0.013387        23.004029         0.920446   \n",
       "\n",
       "   摘要信息_countvec_2  摘要信息_countvec_3  摘要信息_countvec_4  摘要信息_countvec_5  \\\n",
       "0        -0.020211        -0.002396         -0.00033         -0.00434   \n",
       "\n",
       "   摘要信息_countvec_6  摘要信息_countvec_7  摘要信息_countvec_8  摘要信息_countvec_9  \\\n",
       "0        -0.001517        -0.000785         0.004403        -0.006982   \n",
       "\n",
       "   交易对手客户编号_tfidf_0  交易对手客户编号_tfidf_1  交易对手客户编号_tfidf_2  交易对手客户编号_tfidf_3  \\\n",
       "0          0.884758         -0.209479         -0.005232           -0.0003   \n",
       "\n",
       "   交易对手客户编号_tfidf_4  交易对手客户编号_tfidf_5  交易对手客户编号_tfidf_6  交易对手客户编号_tfidf_7  \\\n",
       "0         -0.000148         -0.000956         -0.000188         -0.000054   \n",
       "\n",
       "   交易对手客户编号_tfidf_8  交易对手客户编号_tfidf_9  交易对手客户编号_countvec_0  \\\n",
       "0          0.000012         -0.001242             0.229744   \n",
       "\n",
       "   交易对手客户编号_countvec_1  交易对手客户编号_countvec_2  交易对手客户编号_countvec_3  \\\n",
       "0            13.957486            20.191288            -1.092272   \n",
       "\n",
       "   交易对手客户编号_countvec_4  交易对手客户编号_countvec_5  交易对手客户编号_countvec_6  \\\n",
       "0             0.229367            -0.040072            -0.164456   \n",
       "\n",
       "   交易对手客户编号_countvec_7  交易对手客户编号_countvec_8  交易对手客户编号_countvec_9  \\\n",
       "0            -0.010378             0.011442            -0.001079   \n",
       "\n",
       "   客户编号_交易代码_w2v_0  客户编号_交易代码_w2v_1  客户编号_交易代码_w2v_2  客户编号_交易代码_w2v_3  \\\n",
       "0        -0.819428        -0.073411        -0.349541        -0.372124   \n",
       "\n",
       "   客户编号_交易代码_w2v_4  客户编号_交易代码_w2v_5  客户编号_交易代码_w2v_6  客户编号_交易代码_w2v_7  \\\n",
       "0        -0.384011        -0.285305         0.314123        -0.695355   \n",
       "\n",
       "   客户编号_渠道代码_w2v_0  客户编号_渠道代码_w2v_1  客户编号_渠道代码_w2v_2  客户编号_渠道代码_w2v_3  \\\n",
       "0        -0.830398        -0.187385         0.527071         0.459128   \n",
       "\n",
       "   客户编号_渠道代码_w2v_4  客户编号_渠道代码_w2v_5  客户编号_渠道代码_w2v_6  客户编号_渠道代码_w2v_7  \\\n",
       "0         0.457008         0.509866         0.061953        -0.430103   \n",
       "\n",
       "   客户编号_摘要信息_w2v_0  客户编号_摘要信息_w2v_1  客户编号_摘要信息_w2v_2  客户编号_摘要信息_w2v_3  \\\n",
       "0        -0.327669        -0.084095         1.435355         1.051913   \n",
       "\n",
       "   客户编号_摘要信息_w2v_4  客户编号_摘要信息_w2v_5  客户编号_摘要信息_w2v_6  客户编号_摘要信息_w2v_7  \\\n",
       "0         0.641961         0.656063        -4.134728         0.782936   \n",
       "\n",
       "   客户编号_交易对手客户编号_w2v_0  客户编号_交易对手客户编号_w2v_1  客户编号_交易对手客户编号_w2v_2  \\\n",
       "0             0.261499            -0.983913             2.340259   \n",
       "\n",
       "   客户编号_交易对手客户编号_w2v_3  客户编号_交易对手客户编号_w2v_4  客户编号_交易对手客户编号_w2v_5  \\\n",
       "0             0.130165             0.516862             0.147036   \n",
       "\n",
       "   客户编号_交易对手客户编号_w2v_6  客户编号_交易对手客户编号_w2v_7  \n",
       "0            -0.565741             1.807155  "
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fncl_tr_dtal_text = FNCL_TR_DTAL_text()\n",
    "print(fncl_tr_dtal_text.shape)\n",
    "fncl_tr_dtal_text.head(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "106c9599-1a27-4ccc-b435-18df061530f1",
   "metadata": {},
   "source": [
    "# 四张特征表聚合并输出文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "e36e791e-a214-4102-8835-1a7e67828ed3",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T03:19:35.350989Z",
     "iopub.status.busy": "2024-11-11T03:19:35.350627Z",
     "iopub.status.idle": "2024-11-11T03:19:36.139900Z",
     "msg_id": "8d07993f-fc19-40a9-aaff-002956effaf8",
     "shell.execute_reply": "2024-11-11T03:19:36.139228Z",
     "shell.execute_reply.started": "2024-11-11T03:19:35.350961Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(59079, 355)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>客户编号</th>\n",
       "      <th>注册资本</th>\n",
       "      <th>是否长期经营</th>\n",
       "      <th>经营成立时间是否相等</th>\n",
       "      <th>注册资金过小</th>\n",
       "      <th>经营是否已过期</th>\n",
       "      <th>剩余经营天数_天</th>\n",
       "      <th>已经营天数_天</th>\n",
       "      <th>当期经营期限总天数_天</th>\n",
       "      <th>自成立经营期限总天数_天</th>\n",
       "      <th>已成立天数_天</th>\n",
       "      <th>再次经营_天</th>\n",
       "      <th>剩余经营天数_月</th>\n",
       "      <th>已经营天数_月</th>\n",
       "      <th>当期经营期限总天数_月</th>\n",
       "      <th>自成立经营期限总天数_月</th>\n",
       "      <th>已成立天数_月</th>\n",
       "      <th>再次经营_月</th>\n",
       "      <th>剩余经营天数_年</th>\n",
       "      <th>已经营天数_年</th>\n",
       "      <th>当期经营期限总天数_年</th>\n",
       "      <th>自成立经营期限总天数_年</th>\n",
       "      <th>已成立天数_年</th>\n",
       "      <th>再次经营_年</th>\n",
       "      <th>经营状态_编码</th>\n",
       "      <th>企业（机构）类型编码_频数是否前10</th>\n",
       "      <th>企业（机构）类型编码_频数是否后20</th>\n",
       "      <th>企业（机构）类型编码_是否频数最高2类</th>\n",
       "      <th>所在省份编码_频数是否前5</th>\n",
       "      <th>所在省份编码_频数是否后5</th>\n",
       "      <th>企业（机构）类型编码_是否坏率最高2类</th>\n",
       "      <th>国民经济行业代码_频数是否前5</th>\n",
       "      <th>国民经济行业代码_频数是否后230</th>\n",
       "      <th>企业（机构）类型编码_分箱</th>\n",
       "      <th>所在省份编码_分箱</th>\n",
       "      <th>国民经济行业代码_分箱</th>\n",
       "      <th>法定代表人相关企业个数</th>\n",
       "      <th>法人涉足企业类型</th>\n",
       "      <th>法人涉足国民经济行业代码</th>\n",
       "      <th>法人跨省个数</th>\n",
       "      <th>法定代表人_tfidf_0</th>\n",
       "      <th>法定代表人_tfidf_1</th>\n",
       "      <th>法定代表人_tfidf_2</th>\n",
       "      <th>法定代表人_tfidf_3</th>\n",
       "      <th>法定代表人_tfidf_4</th>\n",
       "      <th>法定代表人_tfidf_5</th>\n",
       "      <th>法定代表人_tfidf_6</th>\n",
       "      <th>法定代表人_tfidf_7</th>\n",
       "      <th>法定代表人_tfidf_8</th>\n",
       "      <th>法定代表人_tfidf_9</th>\n",
       "      <th>法定代表人_countvec_0</th>\n",
       "      <th>法定代表人_countvec_1</th>\n",
       "      <th>法定代表人_countvec_2</th>\n",
       "      <th>法定代表人_countvec_3</th>\n",
       "      <th>法定代表人_countvec_4</th>\n",
       "      <th>法定代表人_countvec_5</th>\n",
       "      <th>法定代表人_countvec_6</th>\n",
       "      <th>法定代表人_countvec_7</th>\n",
       "      <th>法定代表人_countvec_8</th>\n",
       "      <th>法定代表人_countvec_9</th>\n",
       "      <th>企业（机构）类型编码_tfidf_0</th>\n",
       "      <th>企业（机构）类型编码_tfidf_1</th>\n",
       "      <th>企业（机构）类型编码_tfidf_2</th>\n",
       "      <th>企业（机构）类型编码_tfidf_3</th>\n",
       "      <th>企业（机构）类型编码_tfidf_4</th>\n",
       "      <th>企业（机构）类型编码_tfidf_5</th>\n",
       "      <th>企业（机构）类型编码_tfidf_6</th>\n",
       "      <th>企业（机构）类型编码_tfidf_7</th>\n",
       "      <th>企业（机构）类型编码_tfidf_8</th>\n",
       "      <th>企业（机构）类型编码_tfidf_9</th>\n",
       "      <th>企业（机构）类型编码_countvec_0</th>\n",
       "      <th>企业（机构）类型编码_countvec_1</th>\n",
       "      <th>企业（机构）类型编码_countvec_2</th>\n",
       "      <th>企业（机构）类型编码_countvec_3</th>\n",
       "      <th>企业（机构）类型编码_countvec_4</th>\n",
       "      <th>企业（机构）类型编码_countvec_5</th>\n",
       "      <th>企业（机构）类型编码_countvec_6</th>\n",
       "      <th>企业（机构）类型编码_countvec_7</th>\n",
       "      <th>企业（机构）类型编码_countvec_8</th>\n",
       "      <th>企业（机构）类型编码_countvec_9</th>\n",
       "      <th>所在省份编码_tfidf_0</th>\n",
       "      <th>所在省份编码_tfidf_1</th>\n",
       "      <th>所在省份编码_tfidf_2</th>\n",
       "      <th>所在省份编码_tfidf_3</th>\n",
       "      <th>所在省份编码_tfidf_4</th>\n",
       "      <th>所在省份编码_tfidf_5</th>\n",
       "      <th>所在省份编码_tfidf_6</th>\n",
       "      <th>所在省份编码_tfidf_7</th>\n",
       "      <th>所在省份编码_tfidf_8</th>\n",
       "      <th>所在省份编码_tfidf_9</th>\n",
       "      <th>所在省份编码_countvec_0</th>\n",
       "      <th>所在省份编码_countvec_1</th>\n",
       "      <th>所在省份编码_countvec_2</th>\n",
       "      <th>所在省份编码_countvec_3</th>\n",
       "      <th>所在省份编码_countvec_4</th>\n",
       "      <th>所在省份编码_countvec_5</th>\n",
       "      <th>所在省份编码_countvec_6</th>\n",
       "      <th>所在省份编码_countvec_7</th>\n",
       "      <th>所在省份编码_countvec_8</th>\n",
       "      <th>所在省份编码_countvec_9</th>\n",
       "      <th>...</th>\n",
       "      <th>交易代码_countvec_2</th>\n",
       "      <th>交易代码_countvec_3</th>\n",
       "      <th>交易代码_countvec_4</th>\n",
       "      <th>交易代码_countvec_5</th>\n",
       "      <th>交易代码_countvec_6</th>\n",
       "      <th>交易代码_countvec_7</th>\n",
       "      <th>交易代码_countvec_8</th>\n",
       "      <th>交易代码_countvec_9</th>\n",
       "      <th>渠道代码_tfidf_0</th>\n",
       "      <th>渠道代码_tfidf_1</th>\n",
       "      <th>渠道代码_tfidf_2</th>\n",
       "      <th>渠道代码_tfidf_3</th>\n",
       "      <th>渠道代码_tfidf_4</th>\n",
       "      <th>渠道代码_tfidf_5</th>\n",
       "      <th>渠道代码_tfidf_6</th>\n",
       "      <th>渠道代码_tfidf_7</th>\n",
       "      <th>渠道代码_tfidf_8</th>\n",
       "      <th>渠道代码_tfidf_9</th>\n",
       "      <th>渠道代码_countvec_0</th>\n",
       "      <th>渠道代码_countvec_1</th>\n",
       "      <th>渠道代码_countvec_2</th>\n",
       "      <th>渠道代码_countvec_3</th>\n",
       "      <th>渠道代码_countvec_4</th>\n",
       "      <th>渠道代码_countvec_5</th>\n",
       "      <th>渠道代码_countvec_6</th>\n",
       "      <th>渠道代码_countvec_7</th>\n",
       "      <th>渠道代码_countvec_8</th>\n",
       "      <th>渠道代码_countvec_9</th>\n",
       "      <th>摘要信息_tfidf_0</th>\n",
       "      <th>摘要信息_tfidf_1</th>\n",
       "      <th>摘要信息_tfidf_2</th>\n",
       "      <th>摘要信息_tfidf_3</th>\n",
       "      <th>摘要信息_tfidf_4</th>\n",
       "      <th>摘要信息_tfidf_5</th>\n",
       "      <th>摘要信息_tfidf_6</th>\n",
       "      <th>摘要信息_tfidf_7</th>\n",
       "      <th>摘要信息_tfidf_8</th>\n",
       "      <th>摘要信息_tfidf_9</th>\n",
       "      <th>摘要信息_countvec_0</th>\n",
       "      <th>摘要信息_countvec_1</th>\n",
       "      <th>摘要信息_countvec_2</th>\n",
       "      <th>摘要信息_countvec_3</th>\n",
       "      <th>摘要信息_countvec_4</th>\n",
       "      <th>摘要信息_countvec_5</th>\n",
       "      <th>摘要信息_countvec_6</th>\n",
       "      <th>摘要信息_countvec_7</th>\n",
       "      <th>摘要信息_countvec_8</th>\n",
       "      <th>摘要信息_countvec_9</th>\n",
       "      <th>交易对手客户编号_tfidf_0</th>\n",
       "      <th>交易对手客户编号_tfidf_1</th>\n",
       "      <th>交易对手客户编号_tfidf_2</th>\n",
       "      <th>交易对手客户编号_tfidf_3</th>\n",
       "      <th>交易对手客户编号_tfidf_4</th>\n",
       "      <th>交易对手客户编号_tfidf_5</th>\n",
       "      <th>交易对手客户编号_tfidf_6</th>\n",
       "      <th>交易对手客户编号_tfidf_7</th>\n",
       "      <th>交易对手客户编号_tfidf_8</th>\n",
       "      <th>交易对手客户编号_tfidf_9</th>\n",
       "      <th>交易对手客户编号_countvec_0</th>\n",
       "      <th>交易对手客户编号_countvec_1</th>\n",
       "      <th>交易对手客户编号_countvec_2</th>\n",
       "      <th>交易对手客户编号_countvec_3</th>\n",
       "      <th>交易对手客户编号_countvec_4</th>\n",
       "      <th>交易对手客户编号_countvec_5</th>\n",
       "      <th>交易对手客户编号_countvec_6</th>\n",
       "      <th>交易对手客户编号_countvec_7</th>\n",
       "      <th>交易对手客户编号_countvec_8</th>\n",
       "      <th>交易对手客户编号_countvec_9</th>\n",
       "      <th>客户编号_交易代码_w2v_0</th>\n",
       "      <th>客户编号_交易代码_w2v_1</th>\n",
       "      <th>客户编号_交易代码_w2v_2</th>\n",
       "      <th>客户编号_交易代码_w2v_3</th>\n",
       "      <th>客户编号_交易代码_w2v_4</th>\n",
       "      <th>客户编号_交易代码_w2v_5</th>\n",
       "      <th>客户编号_交易代码_w2v_6</th>\n",
       "      <th>客户编号_交易代码_w2v_7</th>\n",
       "      <th>客户编号_渠道代码_w2v_0</th>\n",
       "      <th>客户编号_渠道代码_w2v_1</th>\n",
       "      <th>客户编号_渠道代码_w2v_2</th>\n",
       "      <th>客户编号_渠道代码_w2v_3</th>\n",
       "      <th>客户编号_渠道代码_w2v_4</th>\n",
       "      <th>客户编号_渠道代码_w2v_5</th>\n",
       "      <th>客户编号_渠道代码_w2v_6</th>\n",
       "      <th>客户编号_渠道代码_w2v_7</th>\n",
       "      <th>客户编号_摘要信息_w2v_0</th>\n",
       "      <th>客户编号_摘要信息_w2v_1</th>\n",
       "      <th>客户编号_摘要信息_w2v_2</th>\n",
       "      <th>客户编号_摘要信息_w2v_3</th>\n",
       "      <th>客户编号_摘要信息_w2v_4</th>\n",
       "      <th>客户编号_摘要信息_w2v_5</th>\n",
       "      <th>客户编号_摘要信息_w2v_6</th>\n",
       "      <th>客户编号_摘要信息_w2v_7</th>\n",
       "      <th>客户编号_交易对手客户编号_w2v_0</th>\n",
       "      <th>客户编号_交易对手客户编号_w2v_1</th>\n",
       "      <th>客户编号_交易对手客户编号_w2v_2</th>\n",
       "      <th>客户编号_交易对手客户编号_w2v_3</th>\n",
       "      <th>客户编号_交易对手客户编号_w2v_4</th>\n",
       "      <th>客户编号_交易对手客户编号_w2v_5</th>\n",
       "      <th>客户编号_交易对手客户编号_w2v_6</th>\n",
       "      <th>客户编号_交易对手客户编号_w2v_7</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>182d6a854532dd26a1b111e77bd501f4</td>\n",
       "      <td>690521.61</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>14574</td>\n",
       "      <td>3675</td>\n",
       "      <td>18249</td>\n",
       "      <td>18249</td>\n",
       "      <td>3675</td>\n",
       "      <td>0</td>\n",
       "      <td>480</td>\n",
       "      <td>120</td>\n",
       "      <td>600</td>\n",
       "      <td>600</td>\n",
       "      <td>120</td>\n",
       "      <td>0</td>\n",
       "      <td>40.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>50.0</td>\n",
       "      <td>50.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.102424e-33</td>\n",
       "      <td>2.667420e-33</td>\n",
       "      <td>3.992042e-32</td>\n",
       "      <td>8.008220e-31</td>\n",
       "      <td>-3.629886e-31</td>\n",
       "      <td>-9.217419e-31</td>\n",
       "      <td>-1.847758e-30</td>\n",
       "      <td>-1.470448e-29</td>\n",
       "      <td>6.370142e-30</td>\n",
       "      <td>2.460111e-29</td>\n",
       "      <td>-1.102424e-33</td>\n",
       "      <td>2.667420e-33</td>\n",
       "      <td>3.992042e-32</td>\n",
       "      <td>8.008220e-31</td>\n",
       "      <td>-3.629886e-31</td>\n",
       "      <td>-9.217419e-31</td>\n",
       "      <td>-1.847758e-30</td>\n",
       "      <td>-1.470448e-29</td>\n",
       "      <td>6.370142e-30</td>\n",
       "      <td>2.460111e-29</td>\n",
       "      <td>1.0</td>\n",
       "      <td>-7.412313e-18</td>\n",
       "      <td>-2.602485e-21</td>\n",
       "      <td>-8.368757e-29</td>\n",
       "      <td>9.075137e-33</td>\n",
       "      <td>9.408616e-35</td>\n",
       "      <td>-2.943408e-33</td>\n",
       "      <td>-2.544040e-33</td>\n",
       "      <td>-1.943142e-37</td>\n",
       "      <td>9.602446e-37</td>\n",
       "      <td>1.0</td>\n",
       "      <td>-7.412313e-18</td>\n",
       "      <td>-2.602485e-21</td>\n",
       "      <td>-8.368757e-29</td>\n",
       "      <td>9.075137e-33</td>\n",
       "      <td>9.408616e-35</td>\n",
       "      <td>-2.943408e-33</td>\n",
       "      <td>-2.544040e-33</td>\n",
       "      <td>-1.943142e-37</td>\n",
       "      <td>9.602446e-37</td>\n",
       "      <td>1.647601e-17</td>\n",
       "      <td>9.703892e-18</td>\n",
       "      <td>-5.928742e-19</td>\n",
       "      <td>5.131871e-17</td>\n",
       "      <td>-5.831571e-16</td>\n",
       "      <td>-1.076496e-15</td>\n",
       "      <td>1.798985e-14</td>\n",
       "      <td>4.988046e-13</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.458765e-14</td>\n",
       "      <td>1.647601e-17</td>\n",
       "      <td>9.703892e-18</td>\n",
       "      <td>-5.928742e-19</td>\n",
       "      <td>5.131871e-17</td>\n",
       "      <td>-5.831571e-16</td>\n",
       "      <td>-1.076496e-15</td>\n",
       "      <td>1.798985e-14</td>\n",
       "      <td>4.988046e-13</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.458765e-14</td>\n",
       "      <td>...</td>\n",
       "      <td>173.537591</td>\n",
       "      <td>-55.911452</td>\n",
       "      <td>-14.67825</td>\n",
       "      <td>3.638379</td>\n",
       "      <td>-0.181371</td>\n",
       "      <td>-12.794177</td>\n",
       "      <td>-67.592688</td>\n",
       "      <td>-6.488669</td>\n",
       "      <td>0.663855</td>\n",
       "      <td>0.6649</td>\n",
       "      <td>-0.079086</td>\n",
       "      <td>-0.164614</td>\n",
       "      <td>-0.012042</td>\n",
       "      <td>-0.034964</td>\n",
       "      <td>0.111919</td>\n",
       "      <td>0.016231</td>\n",
       "      <td>-0.041622</td>\n",
       "      <td>0.004708</td>\n",
       "      <td>366.858363</td>\n",
       "      <td>-104.664875</td>\n",
       "      <td>124.010924</td>\n",
       "      <td>-86.75876</td>\n",
       "      <td>-22.611876</td>\n",
       "      <td>-66.351759</td>\n",
       "      <td>-2.308754</td>\n",
       "      <td>-2.319975</td>\n",
       "      <td>-6.69274</td>\n",
       "      <td>-1.620721</td>\n",
       "      <td>0.450894</td>\n",
       "      <td>0.677956</td>\n",
       "      <td>-0.053542</td>\n",
       "      <td>-0.070444</td>\n",
       "      <td>0.056371</td>\n",
       "      <td>0.006496</td>\n",
       "      <td>-0.012164</td>\n",
       "      <td>-0.000835</td>\n",
       "      <td>-0.023478</td>\n",
       "      <td>-0.021109</td>\n",
       "      <td>166.624821</td>\n",
       "      <td>169.231007</td>\n",
       "      <td>-3.190668</td>\n",
       "      <td>-0.216109</td>\n",
       "      <td>-0.027104</td>\n",
       "      <td>36.891806</td>\n",
       "      <td>-0.373806</td>\n",
       "      <td>-1.176719</td>\n",
       "      <td>25.185824</td>\n",
       "      <td>5.183883</td>\n",
       "      <td>0.698841</td>\n",
       "      <td>0.190728</td>\n",
       "      <td>-0.002966</td>\n",
       "      <td>-0.001514</td>\n",
       "      <td>-0.000989</td>\n",
       "      <td>-0.004369</td>\n",
       "      <td>-0.00041</td>\n",
       "      <td>-0.000319</td>\n",
       "      <td>-0.001691</td>\n",
       "      <td>-0.00073</td>\n",
       "      <td>5.824015</td>\n",
       "      <td>338.150543</td>\n",
       "      <td>171.000325</td>\n",
       "      <td>-9.896696</td>\n",
       "      <td>0.772088</td>\n",
       "      <td>-0.382127</td>\n",
       "      <td>-1.527876</td>\n",
       "      <td>-0.124638</td>\n",
       "      <td>0.089372</td>\n",
       "      <td>-0.033513</td>\n",
       "      <td>-0.767407</td>\n",
       "      <td>-0.282165</td>\n",
       "      <td>-0.057123</td>\n",
       "      <td>-0.432119</td>\n",
       "      <td>-0.365658</td>\n",
       "      <td>0.057298</td>\n",
       "      <td>0.008832</td>\n",
       "      <td>-0.427326</td>\n",
       "      <td>-0.308943</td>\n",
       "      <td>-0.200226</td>\n",
       "      <td>0.559307</td>\n",
       "      <td>0.218517</td>\n",
       "      <td>0.445634</td>\n",
       "      <td>0.088007</td>\n",
       "      <td>-0.050025</td>\n",
       "      <td>-0.511898</td>\n",
       "      <td>0.79734</td>\n",
       "      <td>0.317817</td>\n",
       "      <td>1.319525</td>\n",
       "      <td>0.951583</td>\n",
       "      <td>1.069658</td>\n",
       "      <td>0.499987</td>\n",
       "      <td>-2.772049</td>\n",
       "      <td>1.284298</td>\n",
       "      <td>0.013406</td>\n",
       "      <td>-1.062168</td>\n",
       "      <td>2.123109</td>\n",
       "      <td>-0.100165</td>\n",
       "      <td>0.541643</td>\n",
       "      <td>0.028911</td>\n",
       "      <td>-0.657138</td>\n",
       "      <td>1.716911</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1 rows × 355 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                               客户编号       注册资本  是否长期经营  经营成立时间是否相等  注册资金过小  \\\n",
       "0  182d6a854532dd26a1b111e77bd501f4  690521.61       0           1       0   \n",
       "\n",
       "   经营是否已过期  剩余经营天数_天  已经营天数_天  当期经营期限总天数_天  自成立经营期限总天数_天  已成立天数_天  再次经营_天  \\\n",
       "0        0     14574     3675        18249         18249     3675       0   \n",
       "\n",
       "   剩余经营天数_月  已经营天数_月  当期经营期限总天数_月  自成立经营期限总天数_月  已成立天数_月  再次经营_月  剩余经营天数_年  \\\n",
       "0       480      120          600           600      120       0      40.0   \n",
       "\n",
       "   已经营天数_年  当期经营期限总天数_年  自成立经营期限总天数_年  已成立天数_年  再次经营_年  经营状态_编码  \\\n",
       "0     10.0         50.0          50.0     10.0     0.0        1   \n",
       "\n",
       "   企业（机构）类型编码_频数是否前10  企业（机构）类型编码_频数是否后20  企业（机构）类型编码_是否频数最高2类  所在省份编码_频数是否前5  \\\n",
       "0                   1                   0                    1              0   \n",
       "\n",
       "   所在省份编码_频数是否后5  企业（机构）类型编码_是否坏率最高2类  国民经济行业代码_频数是否前5  国民经济行业代码_频数是否后230  \\\n",
       "0              0                    0                0                  0   \n",
       "\n",
       "   企业（机构）类型编码_分箱  所在省份编码_分箱  国民经济行业代码_分箱  法定代表人相关企业个数  法人涉足企业类型  法人涉足国民经济行业代码  \\\n",
       "0              1          3            2            1         1             1   \n",
       "\n",
       "   法人跨省个数  法定代表人_tfidf_0  法定代表人_tfidf_1  法定代表人_tfidf_2  法定代表人_tfidf_3  \\\n",
       "0       1  -1.102424e-33   2.667420e-33   3.992042e-32   8.008220e-31   \n",
       "\n",
       "   法定代表人_tfidf_4  法定代表人_tfidf_5  法定代表人_tfidf_6  法定代表人_tfidf_7  法定代表人_tfidf_8  \\\n",
       "0  -3.629886e-31  -9.217419e-31  -1.847758e-30  -1.470448e-29   6.370142e-30   \n",
       "\n",
       "   法定代表人_tfidf_9  法定代表人_countvec_0  法定代表人_countvec_1  法定代表人_countvec_2  \\\n",
       "0   2.460111e-29     -1.102424e-33      2.667420e-33      3.992042e-32   \n",
       "\n",
       "   法定代表人_countvec_3  法定代表人_countvec_4  法定代表人_countvec_5  法定代表人_countvec_6  \\\n",
       "0      8.008220e-31     -3.629886e-31     -9.217419e-31     -1.847758e-30   \n",
       "\n",
       "   法定代表人_countvec_7  法定代表人_countvec_8  法定代表人_countvec_9  企业（机构）类型编码_tfidf_0  \\\n",
       "0     -1.470448e-29      6.370142e-30      2.460111e-29                 1.0   \n",
       "\n",
       "   企业（机构）类型编码_tfidf_1  企业（机构）类型编码_tfidf_2  企业（机构）类型编码_tfidf_3  \\\n",
       "0       -7.412313e-18       -2.602485e-21       -8.368757e-29   \n",
       "\n",
       "   企业（机构）类型编码_tfidf_4  企业（机构）类型编码_tfidf_5  企业（机构）类型编码_tfidf_6  \\\n",
       "0        9.075137e-33        9.408616e-35       -2.943408e-33   \n",
       "\n",
       "   企业（机构）类型编码_tfidf_7  企业（机构）类型编码_tfidf_8  企业（机构）类型编码_tfidf_9  \\\n",
       "0       -2.544040e-33       -1.943142e-37        9.602446e-37   \n",
       "\n",
       "   企业（机构）类型编码_countvec_0  企业（机构）类型编码_countvec_1  企业（机构）类型编码_countvec_2  \\\n",
       "0                    1.0          -7.412313e-18          -2.602485e-21   \n",
       "\n",
       "   企业（机构）类型编码_countvec_3  企业（机构）类型编码_countvec_4  企业（机构）类型编码_countvec_5  \\\n",
       "0          -8.368757e-29           9.075137e-33           9.408616e-35   \n",
       "\n",
       "   企业（机构）类型编码_countvec_6  企业（机构）类型编码_countvec_7  企业（机构）类型编码_countvec_8  \\\n",
       "0          -2.943408e-33          -2.544040e-33          -1.943142e-37   \n",
       "\n",
       "   企业（机构）类型编码_countvec_9  所在省份编码_tfidf_0  所在省份编码_tfidf_1  所在省份编码_tfidf_2  \\\n",
       "0           9.602446e-37    1.647601e-17    9.703892e-18   -5.928742e-19   \n",
       "\n",
       "   所在省份编码_tfidf_3  所在省份编码_tfidf_4  所在省份编码_tfidf_5  所在省份编码_tfidf_6  \\\n",
       "0    5.131871e-17   -5.831571e-16   -1.076496e-15    1.798985e-14   \n",
       "\n",
       "   所在省份编码_tfidf_7  所在省份编码_tfidf_8  所在省份编码_tfidf_9  所在省份编码_countvec_0  \\\n",
       "0    4.988046e-13             1.0    1.458765e-14       1.647601e-17   \n",
       "\n",
       "   所在省份编码_countvec_1  所在省份编码_countvec_2  所在省份编码_countvec_3  所在省份编码_countvec_4  \\\n",
       "0       9.703892e-18      -5.928742e-19       5.131871e-17      -5.831571e-16   \n",
       "\n",
       "   所在省份编码_countvec_5  所在省份编码_countvec_6  所在省份编码_countvec_7  所在省份编码_countvec_8  \\\n",
       "0      -1.076496e-15       1.798985e-14       4.988046e-13                1.0   \n",
       "\n",
       "   所在省份编码_countvec_9  ...  交易代码_countvec_2  交易代码_countvec_3  交易代码_countvec_4  \\\n",
       "0       1.458765e-14  ...       173.537591       -55.911452        -14.67825   \n",
       "\n",
       "   交易代码_countvec_5  交易代码_countvec_6  交易代码_countvec_7  交易代码_countvec_8  \\\n",
       "0         3.638379        -0.181371       -12.794177       -67.592688   \n",
       "\n",
       "   交易代码_countvec_9  渠道代码_tfidf_0  渠道代码_tfidf_1  渠道代码_tfidf_2  渠道代码_tfidf_3  \\\n",
       "0        -6.488669      0.663855        0.6649     -0.079086     -0.164614   \n",
       "\n",
       "   渠道代码_tfidf_4  渠道代码_tfidf_5  渠道代码_tfidf_6  渠道代码_tfidf_7  渠道代码_tfidf_8  \\\n",
       "0     -0.012042     -0.034964      0.111919      0.016231     -0.041622   \n",
       "\n",
       "   渠道代码_tfidf_9  渠道代码_countvec_0  渠道代码_countvec_1  渠道代码_countvec_2  \\\n",
       "0      0.004708       366.858363      -104.664875       124.010924   \n",
       "\n",
       "   渠道代码_countvec_3  渠道代码_countvec_4  渠道代码_countvec_5  渠道代码_countvec_6  \\\n",
       "0        -86.75876       -22.611876       -66.351759        -2.308754   \n",
       "\n",
       "   渠道代码_countvec_7  渠道代码_countvec_8  渠道代码_countvec_9  摘要信息_tfidf_0  \\\n",
       "0        -2.319975         -6.69274        -1.620721      0.450894   \n",
       "\n",
       "   摘要信息_tfidf_1  摘要信息_tfidf_2  摘要信息_tfidf_3  摘要信息_tfidf_4  摘要信息_tfidf_5  \\\n",
       "0      0.677956     -0.053542     -0.070444      0.056371      0.006496   \n",
       "\n",
       "   摘要信息_tfidf_6  摘要信息_tfidf_7  摘要信息_tfidf_8  摘要信息_tfidf_9  摘要信息_countvec_0  \\\n",
       "0     -0.012164     -0.000835     -0.023478     -0.021109       166.624821   \n",
       "\n",
       "   摘要信息_countvec_1  摘要信息_countvec_2  摘要信息_countvec_3  摘要信息_countvec_4  \\\n",
       "0       169.231007        -3.190668        -0.216109        -0.027104   \n",
       "\n",
       "   摘要信息_countvec_5  摘要信息_countvec_6  摘要信息_countvec_7  摘要信息_countvec_8  \\\n",
       "0        36.891806        -0.373806        -1.176719        25.185824   \n",
       "\n",
       "   摘要信息_countvec_9  交易对手客户编号_tfidf_0  交易对手客户编号_tfidf_1  交易对手客户编号_tfidf_2  \\\n",
       "0         5.183883          0.698841          0.190728         -0.002966   \n",
       "\n",
       "   交易对手客户编号_tfidf_3  交易对手客户编号_tfidf_4  交易对手客户编号_tfidf_5  交易对手客户编号_tfidf_6  \\\n",
       "0         -0.001514         -0.000989         -0.004369          -0.00041   \n",
       "\n",
       "   交易对手客户编号_tfidf_7  交易对手客户编号_tfidf_8  交易对手客户编号_tfidf_9  交易对手客户编号_countvec_0  \\\n",
       "0         -0.000319         -0.001691          -0.00073             5.824015   \n",
       "\n",
       "   交易对手客户编号_countvec_1  交易对手客户编号_countvec_2  交易对手客户编号_countvec_3  \\\n",
       "0           338.150543           171.000325            -9.896696   \n",
       "\n",
       "   交易对手客户编号_countvec_4  交易对手客户编号_countvec_5  交易对手客户编号_countvec_6  \\\n",
       "0             0.772088            -0.382127            -1.527876   \n",
       "\n",
       "   交易对手客户编号_countvec_7  交易对手客户编号_countvec_8  交易对手客户编号_countvec_9  \\\n",
       "0            -0.124638             0.089372            -0.033513   \n",
       "\n",
       "   客户编号_交易代码_w2v_0  客户编号_交易代码_w2v_1  客户编号_交易代码_w2v_2  客户编号_交易代码_w2v_3  \\\n",
       "0        -0.767407        -0.282165        -0.057123        -0.432119   \n",
       "\n",
       "   客户编号_交易代码_w2v_4  客户编号_交易代码_w2v_5  客户编号_交易代码_w2v_6  客户编号_交易代码_w2v_7  \\\n",
       "0        -0.365658         0.057298         0.008832        -0.427326   \n",
       "\n",
       "   客户编号_渠道代码_w2v_0  客户编号_渠道代码_w2v_1  客户编号_渠道代码_w2v_2  客户编号_渠道代码_w2v_3  \\\n",
       "0        -0.308943        -0.200226         0.559307         0.218517   \n",
       "\n",
       "   客户编号_渠道代码_w2v_4  客户编号_渠道代码_w2v_5  客户编号_渠道代码_w2v_6  客户编号_渠道代码_w2v_7  \\\n",
       "0         0.445634         0.088007        -0.050025        -0.511898   \n",
       "\n",
       "   客户编号_摘要信息_w2v_0  客户编号_摘要信息_w2v_1  客户编号_摘要信息_w2v_2  客户编号_摘要信息_w2v_3  \\\n",
       "0          0.79734         0.317817         1.319525         0.951583   \n",
       "\n",
       "   客户编号_摘要信息_w2v_4  客户编号_摘要信息_w2v_5  客户编号_摘要信息_w2v_6  客户编号_摘要信息_w2v_7  \\\n",
       "0         1.069658         0.499987        -2.772049         1.284298   \n",
       "\n",
       "   客户编号_交易对手客户编号_w2v_0  客户编号_交易对手客户编号_w2v_1  客户编号_交易对手客户编号_w2v_2  \\\n",
       "0             0.013406            -1.062168             2.123109   \n",
       "\n",
       "   客户编号_交易对手客户编号_w2v_3  客户编号_交易对手客户编号_w2v_4  客户编号_交易对手客户编号_w2v_5  \\\n",
       "0            -0.100165             0.541643             0.028911   \n",
       "\n",
       "   客户编号_交易对手客户编号_w2v_6  客户编号_交易对手客户编号_w2v_7  \n",
       "0            -0.657138             1.716911  \n",
       "\n",
       "[1 rows x 355 columns]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_hyy = basic_info.merge(basic_text, how = 'left', on = '客户编号')\n",
    "feature_hyy = feature_hyy.merge(fncl_tr_dtal_info_all, how = 'left', on = '客户编号')\n",
    "feature_hyy = feature_hyy.merge(fncl_tr_dtal_text, how = 'left', on = '客户编号')\n",
    "print(feature_hyy.shape)\n",
    "feature_hyy.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "f8f33cc6-690a-491f-8b75-e53e48b9fa2d",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T03:19:36.141224Z",
     "iopub.status.busy": "2024-11-11T03:19:36.140876Z",
     "iopub.status.idle": "2024-11-11T03:19:36.426240Z",
     "msg_id": "f0be2864-2564-421c-a13c-aff014b868d8",
     "shell.execute_reply": "2024-11-11T03:19:36.425542Z",
     "shell.execute_reply.started": "2024-11-11T03:19:36.141197Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(59079, 355)\n"
     ]
    }
   ],
   "source": [
    "print(feature_hyy.shape)\n",
    "feature_hyy.to_pickle(\"../data/hyy特征_A榜.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "ff3dfa71-062c-45d5-94ec-831457f1a5c5",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T03:19:36.427563Z",
     "iopub.status.busy": "2024-11-11T03:19:36.427228Z",
     "iopub.status.idle": "2024-11-11T03:19:36.501473Z",
     "msg_id": "662d7f4c-72a7-4ec0-8490-2eab2f1dc01b",
     "shell.execute_reply": "2024-11-11T03:19:36.500843Z",
     "shell.execute_reply.started": "2024-11-11T03:19:36.427535Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(59079, 1)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>客户编号</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>158a8d99bec2a2b652a6de45a2b52ec9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>b1d244a25a82adb7beafe33fe971402c</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>85b1ab1270516d2ebe21ed00c6abbf27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ef194610bdbecdea9af3cc23bceba8b2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1220f9592fdd0b3fa9bbbd90e6d69d84</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               客户编号\n",
       "0  158a8d99bec2a2b652a6de45a2b52ec9\n",
       "1  b1d244a25a82adb7beafe33fe971402c\n",
       "2  85b1ab1270516d2ebe21ed00c6abbf27\n",
       "3  ef194610bdbecdea9af3cc23bceba8b2\n",
       "4  1220f9592fdd0b3fa9bbbd90e6d69d84"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "file_name = 'XW_ENTINFO_TARGET'\n",
    "TARGET = get_data(file_name, num_rows=None)\n",
    "TARGET = TARGET.drop(['数据日期', 'FLAG', 'is_train'], axis = 1)\n",
    "print(TARGET.shape)\n",
    "TARGET.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "a3ca6af2-b8a0-450a-99f8-6a1df2cada81",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T03:19:36.502870Z",
     "iopub.status.busy": "2024-11-11T03:19:36.502488Z",
     "iopub.status.idle": "2024-11-11T03:19:36.790763Z",
     "msg_id": "1bf99c0e-2eaf-43ce-af6f-675a051dd65d",
     "shell.execute_reply": "2024-11-11T03:19:36.790119Z",
     "shell.execute_reply.started": "2024-11-11T03:19:36.502842Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>客户编号</th>\n",
       "      <th>客户编号_所在省份编码_w2v_1</th>\n",
       "      <th>所在省份编码_tfidf_0</th>\n",
       "      <th>客户编号_所在省份编码_w2v_3</th>\n",
       "      <th>客户编号_国民经济行业代码_w2v_5</th>\n",
       "      <th>客户编号_国民经济行业代码_w2v_4</th>\n",
       "      <th>客户编号_所在省份编码_w2v_2</th>\n",
       "      <th>客户编号_所在省份编码_w2v_6</th>\n",
       "      <th>所在省份编码_tfidf_9</th>\n",
       "      <th>企业（机构）类型编码_tfidf_0</th>\n",
       "      <th>客户编号_企业（机构）类型编码_w2v_1</th>\n",
       "      <th>企业（机构）类型编码_tfidf_1</th>\n",
       "      <th>客户编号_所在省份编码_w2v_0</th>\n",
       "      <th>注册资本</th>\n",
       "      <th>是否长期经营</th>\n",
       "      <th>经营成立时间是否相等</th>\n",
       "      <th>注册资金过小</th>\n",
       "      <th>经营是否已过期</th>\n",
       "      <th>剩余经营天数_天</th>\n",
       "      <th>已经营天数_天</th>\n",
       "      <th>当期经营期限总天数_天</th>\n",
       "      <th>自成立经营期限总天数_天</th>\n",
       "      <th>已成立天数_天</th>\n",
       "      <th>再次经营_天</th>\n",
       "      <th>剩余经营天数_月</th>\n",
       "      <th>已经营天数_月</th>\n",
       "      <th>当期经营期限总天数_月</th>\n",
       "      <th>自成立经营期限总天数_月</th>\n",
       "      <th>已成立天数_月</th>\n",
       "      <th>再次经营_月</th>\n",
       "      <th>剩余经营天数_年</th>\n",
       "      <th>已经营天数_年</th>\n",
       "      <th>当期经营期限总天数_年</th>\n",
       "      <th>自成立经营期限总天数_年</th>\n",
       "      <th>已成立天数_年</th>\n",
       "      <th>再次经营_年</th>\n",
       "      <th>经营状态_编码</th>\n",
       "      <th>企业（机构）类型编码_频数是否前10</th>\n",
       "      <th>企业（机构）类型编码_频数是否后20</th>\n",
       "      <th>企业（机构）类型编码_是否频数最高2类</th>\n",
       "      <th>所在省份编码_频数是否前5</th>\n",
       "      <th>所在省份编码_频数是否后5</th>\n",
       "      <th>企业（机构）类型编码_是否坏率最高2类</th>\n",
       "      <th>国民经济行业代码_频数是否前5</th>\n",
       "      <th>国民经济行业代码_频数是否后230</th>\n",
       "      <th>企业（机构）类型编码_分箱</th>\n",
       "      <th>所在省份编码_分箱</th>\n",
       "      <th>国民经济行业代码_分箱</th>\n",
       "      <th>法定代表人相关企业个数</th>\n",
       "      <th>法人涉足企业类型</th>\n",
       "      <th>法人涉足国民经济行业代码</th>\n",
       "      <th>法人跨省个数</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>182d6a854532dd26a1b111e77bd501f4</td>\n",
       "      <td>-0.043544</td>\n",
       "      <td>1.647601e-17</td>\n",
       "      <td>-0.032385</td>\n",
       "      <td>-0.104712</td>\n",
       "      <td>0.065054</td>\n",
       "      <td>0.101384</td>\n",
       "      <td>0.07392</td>\n",
       "      <td>1.458765e-14</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.068489</td>\n",
       "      <td>-7.412313e-18</td>\n",
       "      <td>-0.022287</td>\n",
       "      <td>690521.61</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>14574</td>\n",
       "      <td>3675</td>\n",
       "      <td>18249</td>\n",
       "      <td>18249</td>\n",
       "      <td>3675</td>\n",
       "      <td>0</td>\n",
       "      <td>480</td>\n",
       "      <td>120</td>\n",
       "      <td>600</td>\n",
       "      <td>600</td>\n",
       "      <td>120</td>\n",
       "      <td>0</td>\n",
       "      <td>40.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>50.0</td>\n",
       "      <td>50.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               客户编号  客户编号_所在省份编码_w2v_1  所在省份编码_tfidf_0  \\\n",
       "0  182d6a854532dd26a1b111e77bd501f4          -0.043544    1.647601e-17   \n",
       "\n",
       "   客户编号_所在省份编码_w2v_3  客户编号_国民经济行业代码_w2v_5  客户编号_国民经济行业代码_w2v_4  \\\n",
       "0          -0.032385            -0.104712             0.065054   \n",
       "\n",
       "   客户编号_所在省份编码_w2v_2  客户编号_所在省份编码_w2v_6  所在省份编码_tfidf_9  企业（机构）类型编码_tfidf_0  \\\n",
       "0           0.101384            0.07392    1.458765e-14                 1.0   \n",
       "\n",
       "   客户编号_企业（机构）类型编码_w2v_1  企业（机构）类型编码_tfidf_1  客户编号_所在省份编码_w2v_0       注册资本  \\\n",
       "0               0.068489       -7.412313e-18          -0.022287  690521.61   \n",
       "\n",
       "   是否长期经营  经营成立时间是否相等  注册资金过小  经营是否已过期  剩余经营天数_天  已经营天数_天  当期经营期限总天数_天  \\\n",
       "0       0           1       0        0     14574     3675        18249   \n",
       "\n",
       "   自成立经营期限总天数_天  已成立天数_天  再次经营_天  剩余经营天数_月  已经营天数_月  当期经营期限总天数_月  \\\n",
       "0         18249     3675       0       480      120          600   \n",
       "\n",
       "   自成立经营期限总天数_月  已成立天数_月  再次经营_月  剩余经营天数_年  已经营天数_年  当期经营期限总天数_年  \\\n",
       "0           600      120       0      40.0     10.0         50.0   \n",
       "\n",
       "   自成立经营期限总天数_年  已成立天数_年  再次经营_年  经营状态_编码  企业（机构）类型编码_频数是否前10  \\\n",
       "0          50.0     10.0     0.0        1                   1   \n",
       "\n",
       "   企业（机构）类型编码_频数是否后20  企业（机构）类型编码_是否频数最高2类  所在省份编码_频数是否前5  所在省份编码_频数是否后5  \\\n",
       "0                   0                    1              0              0   \n",
       "\n",
       "   企业（机构）类型编码_是否坏率最高2类  国民经济行业代码_频数是否前5  国民经济行业代码_频数是否后230  企业（机构）类型编码_分箱  \\\n",
       "0                    0                0                  0              1   \n",
       "\n",
       "   所在省份编码_分箱  国民经济行业代码_分箱  法定代表人相关企业个数  法人涉足企业类型  法人涉足国民经济行业代码  法人跨省个数  \n",
       "0          3            2            1         1             1       1  "
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "AA_basic = feature_hyy[['客户编号', '客户编号_所在省份编码_w2v_1', '所在省份编码_tfidf_0', '客户编号_所在省份编码_w2v_3',\n",
    "       '客户编号_国民经济行业代码_w2v_5', '客户编号_国民经济行业代码_w2v_4', '客户编号_所在省份编码_w2v_2',\n",
    "       '客户编号_所在省份编码_w2v_6', '所在省份编码_tfidf_9', '企业（机构）类型编码_tfidf_0',\n",
    "       '客户编号_企业（机构）类型编码_w2v_1', '企业（机构）类型编码_tfidf_1', '客户编号_所在省份编码_w2v_0',\n",
    "       '注册资本', '是否长期经营', '经营成立时间是否相等', '注册资金过小', '经营是否已过期', '剩余经营天数_天',\n",
    "       '已经营天数_天', '当期经营期限总天数_天', '自成立经营期限总天数_天', '已成立天数_天', '再次经营_天',\n",
    "       '剩余经营天数_月', '已经营天数_月', '当期经营期限总天数_月', '自成立经营期限总天数_月', '已成立天数_月',\n",
    "       '再次经营_月', '剩余经营天数_年', '已经营天数_年', '当期经营期限总天数_年', '自成立经营期限总天数_年',\n",
    "       '已成立天数_年', '再次经营_年', '经营状态_编码', '企业（机构）类型编码_频数是否前10',\n",
    "       '企业（机构）类型编码_频数是否后20', '企业（机构）类型编码_是否频数最高2类', '所在省份编码_频数是否前5',\n",
    "       '所在省份编码_频数是否后5', '企业（机构）类型编码_是否坏率最高2类', '国民经济行业代码_频数是否前5',\n",
    "       '国民经济行业代码_频数是否后230', '企业（机构）类型编码_分箱', '所在省份编码_分箱', '国民经济行业代码_分箱',\n",
    "       '法定代表人相关企业个数', '法人涉足企业类型', '法人涉足国民经济行业代码', '法人跨省个数']]\n",
    "AA_basic.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "5415b86f-36b9-4b43-beb3-6f22b50a0f79",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T03:19:36.792053Z",
     "iopub.status.busy": "2024-11-11T03:19:36.791711Z",
     "iopub.status.idle": "2024-11-11T03:19:36.906913Z",
     "msg_id": "b890a76c-355a-43d1-92ff-91b6412ff8fa",
     "shell.execute_reply": "2024-11-11T03:19:36.906247Z",
     "shell.execute_reply.started": "2024-11-11T03:19:36.792025Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>客户编号</th>\n",
       "      <th>客户编号_所在省份编码_w2v_1</th>\n",
       "      <th>所在省份编码_tfidf_0</th>\n",
       "      <th>客户编号_所在省份编码_w2v_3</th>\n",
       "      <th>客户编号_国民经济行业代码_w2v_5</th>\n",
       "      <th>客户编号_国民经济行业代码_w2v_4</th>\n",
       "      <th>客户编号_所在省份编码_w2v_2</th>\n",
       "      <th>客户编号_所在省份编码_w2v_6</th>\n",
       "      <th>所在省份编码_tfidf_9</th>\n",
       "      <th>企业（机构）类型编码_tfidf_0</th>\n",
       "      <th>客户编号_企业（机构）类型编码_w2v_1</th>\n",
       "      <th>企业（机构）类型编码_tfidf_1</th>\n",
       "      <th>客户编号_所在省份编码_w2v_0</th>\n",
       "      <th>注册资本</th>\n",
       "      <th>是否长期经营</th>\n",
       "      <th>经营成立时间是否相等</th>\n",
       "      <th>注册资金过小</th>\n",
       "      <th>经营是否已过期</th>\n",
       "      <th>剩余经营天数_天</th>\n",
       "      <th>已经营天数_天</th>\n",
       "      <th>当期经营期限总天数_天</th>\n",
       "      <th>自成立经营期限总天数_天</th>\n",
       "      <th>已成立天数_天</th>\n",
       "      <th>再次经营_天</th>\n",
       "      <th>剩余经营天数_月</th>\n",
       "      <th>已经营天数_月</th>\n",
       "      <th>当期经营期限总天数_月</th>\n",
       "      <th>自成立经营期限总天数_月</th>\n",
       "      <th>已成立天数_月</th>\n",
       "      <th>再次经营_月</th>\n",
       "      <th>剩余经营天数_年</th>\n",
       "      <th>已经营天数_年</th>\n",
       "      <th>当期经营期限总天数_年</th>\n",
       "      <th>自成立经营期限总天数_年</th>\n",
       "      <th>已成立天数_年</th>\n",
       "      <th>再次经营_年</th>\n",
       "      <th>经营状态_编码</th>\n",
       "      <th>企业（机构）类型编码_频数是否前10</th>\n",
       "      <th>企业（机构）类型编码_频数是否后20</th>\n",
       "      <th>企业（机构）类型编码_是否频数最高2类</th>\n",
       "      <th>所在省份编码_频数是否前5</th>\n",
       "      <th>所在省份编码_频数是否后5</th>\n",
       "      <th>企业（机构）类型编码_是否坏率最高2类</th>\n",
       "      <th>国民经济行业代码_频数是否前5</th>\n",
       "      <th>国民经济行业代码_频数是否后230</th>\n",
       "      <th>企业（机构）类型编码_分箱</th>\n",
       "      <th>所在省份编码_分箱</th>\n",
       "      <th>国民经济行业代码_分箱</th>\n",
       "      <th>法定代表人相关企业个数</th>\n",
       "      <th>法人涉足企业类型</th>\n",
       "      <th>法人涉足国民经济行业代码</th>\n",
       "      <th>法人跨省个数</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>158a8d99bec2a2b652a6de45a2b52ec9</td>\n",
       "      <td>-0.015712</td>\n",
       "      <td>-1.340637e-19</td>\n",
       "      <td>0.083170</td>\n",
       "      <td>0.037983</td>\n",
       "      <td>-0.096568</td>\n",
       "      <td>-0.084828</td>\n",
       "      <td>-0.100693</td>\n",
       "      <td>2.174522e-13</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>0.068489</td>\n",
       "      <td>-7.412313e-18</td>\n",
       "      <td>0.110077</td>\n",
       "      <td>690521.61</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5655</td>\n",
       "      <td>1644</td>\n",
       "      <td>7299</td>\n",
       "      <td>7299</td>\n",
       "      <td>1644</td>\n",
       "      <td>0</td>\n",
       "      <td>186</td>\n",
       "      <td>54</td>\n",
       "      <td>240</td>\n",
       "      <td>240</td>\n",
       "      <td>54</td>\n",
       "      <td>0</td>\n",
       "      <td>15.500000</td>\n",
       "      <td>4.500000</td>\n",
       "      <td>20.000000</td>\n",
       "      <td>20.000000</td>\n",
       "      <td>4.500000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>b1d244a25a82adb7beafe33fe971402c</td>\n",
       "      <td>0.035966</td>\n",
       "      <td>-1.161965e-21</td>\n",
       "      <td>0.080690</td>\n",
       "      <td>-0.034836</td>\n",
       "      <td>0.006594</td>\n",
       "      <td>-0.024396</td>\n",
       "      <td>-0.012385</td>\n",
       "      <td>-2.082196e-18</td>\n",
       "      <td>7.410764e-18</td>\n",
       "      <td>-0.101456</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>0.070392</td>\n",
       "      <td>345266.51</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>364059</td>\n",
       "      <td>1748</td>\n",
       "      <td>365807</td>\n",
       "      <td>365807</td>\n",
       "      <td>1748</td>\n",
       "      <td>0</td>\n",
       "      <td>11969</td>\n",
       "      <td>57</td>\n",
       "      <td>12026</td>\n",
       "      <td>12026</td>\n",
       "      <td>57</td>\n",
       "      <td>0</td>\n",
       "      <td>997.416667</td>\n",
       "      <td>4.750000</td>\n",
       "      <td>1002.166667</td>\n",
       "      <td>1002.166667</td>\n",
       "      <td>4.750000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>85b1ab1270516d2ebe21ed00c6abbf27</td>\n",
       "      <td>-0.015712</td>\n",
       "      <td>-1.340637e-19</td>\n",
       "      <td>0.083170</td>\n",
       "      <td>-0.110424</td>\n",
       "      <td>0.080419</td>\n",
       "      <td>-0.084828</td>\n",
       "      <td>-0.100693</td>\n",
       "      <td>2.174522e-13</td>\n",
       "      <td>7.410764e-18</td>\n",
       "      <td>-0.101456</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>0.110077</td>\n",
       "      <td>690521.61</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9818</td>\n",
       "      <td>1130</td>\n",
       "      <td>10948</td>\n",
       "      <td>10948</td>\n",
       "      <td>1130</td>\n",
       "      <td>0</td>\n",
       "      <td>323</td>\n",
       "      <td>37</td>\n",
       "      <td>360</td>\n",
       "      <td>360</td>\n",
       "      <td>37</td>\n",
       "      <td>0</td>\n",
       "      <td>26.916667</td>\n",
       "      <td>3.083333</td>\n",
       "      <td>30.000000</td>\n",
       "      <td>30.000000</td>\n",
       "      <td>3.083333</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ef194610bdbecdea9af3cc23bceba8b2</td>\n",
       "      <td>-0.101456</td>\n",
       "      <td>-1.259253e-12</td>\n",
       "      <td>0.118906</td>\n",
       "      <td>-0.083848</td>\n",
       "      <td>-0.079825</td>\n",
       "      <td>0.006620</td>\n",
       "      <td>0.054369</td>\n",
       "      <td>3.350545e-19</td>\n",
       "      <td>2.943408e-33</td>\n",
       "      <td>0.069596</td>\n",
       "      <td>-2.286179e-28</td>\n",
       "      <td>-0.074633</td>\n",
       "      <td>1312010.26</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>364059</td>\n",
       "      <td>4140</td>\n",
       "      <td>368199</td>\n",
       "      <td>368199</td>\n",
       "      <td>4140</td>\n",
       "      <td>0</td>\n",
       "      <td>11969</td>\n",
       "      <td>136</td>\n",
       "      <td>12105</td>\n",
       "      <td>12105</td>\n",
       "      <td>136</td>\n",
       "      <td>0</td>\n",
       "      <td>997.416667</td>\n",
       "      <td>11.333333</td>\n",
       "      <td>1008.750000</td>\n",
       "      <td>1008.750000</td>\n",
       "      <td>11.333333</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1220f9592fdd0b3fa9bbbd90e6d69d84</td>\n",
       "      <td>0.064522</td>\n",
       "      <td>-2.014757e-19</td>\n",
       "      <td>-0.036369</td>\n",
       "      <td>-0.049622</td>\n",
       "      <td>-0.065308</td>\n",
       "      <td>0.050131</td>\n",
       "      <td>-0.013578</td>\n",
       "      <td>-1.165212e-16</td>\n",
       "      <td>7.410764e-18</td>\n",
       "      <td>-0.101456</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>-0.083693</td>\n",
       "      <td>6904.22</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>364059</td>\n",
       "      <td>1203</td>\n",
       "      <td>365262</td>\n",
       "      <td>365262</td>\n",
       "      <td>1203</td>\n",
       "      <td>0</td>\n",
       "      <td>11969</td>\n",
       "      <td>39</td>\n",
       "      <td>12008</td>\n",
       "      <td>12008</td>\n",
       "      <td>39</td>\n",
       "      <td>0</td>\n",
       "      <td>997.416667</td>\n",
       "      <td>3.250000</td>\n",
       "      <td>1000.666667</td>\n",
       "      <td>1000.666667</td>\n",
       "      <td>3.250000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               客户编号  客户编号_所在省份编码_w2v_1  所在省份编码_tfidf_0  \\\n",
       "0  158a8d99bec2a2b652a6de45a2b52ec9          -0.015712   -1.340637e-19   \n",
       "1  b1d244a25a82adb7beafe33fe971402c           0.035966   -1.161965e-21   \n",
       "2  85b1ab1270516d2ebe21ed00c6abbf27          -0.015712   -1.340637e-19   \n",
       "3  ef194610bdbecdea9af3cc23bceba8b2          -0.101456   -1.259253e-12   \n",
       "4  1220f9592fdd0b3fa9bbbd90e6d69d84           0.064522   -2.014757e-19   \n",
       "\n",
       "   客户编号_所在省份编码_w2v_3  客户编号_国民经济行业代码_w2v_5  客户编号_国民经济行业代码_w2v_4  \\\n",
       "0           0.083170             0.037983            -0.096568   \n",
       "1           0.080690            -0.034836             0.006594   \n",
       "2           0.083170            -0.110424             0.080419   \n",
       "3           0.118906            -0.083848            -0.079825   \n",
       "4          -0.036369            -0.049622            -0.065308   \n",
       "\n",
       "   客户编号_所在省份编码_w2v_2  客户编号_所在省份编码_w2v_6  所在省份编码_tfidf_9  企业（机构）类型编码_tfidf_0  \\\n",
       "0          -0.084828          -0.100693    2.174522e-13        1.000000e+00   \n",
       "1          -0.024396          -0.012385   -2.082196e-18        7.410764e-18   \n",
       "2          -0.084828          -0.100693    2.174522e-13        7.410764e-18   \n",
       "3           0.006620           0.054369    3.350545e-19        2.943408e-33   \n",
       "4           0.050131          -0.013578   -1.165212e-16        7.410764e-18   \n",
       "\n",
       "   客户编号_企业（机构）类型编码_w2v_1  企业（机构）类型编码_tfidf_1  客户编号_所在省份编码_w2v_0        注册资本  \\\n",
       "0               0.068489       -7.412313e-18           0.110077   690521.61   \n",
       "1              -0.101456        1.000000e+00           0.070392   345266.51   \n",
       "2              -0.101456        1.000000e+00           0.110077   690521.61   \n",
       "3               0.069596       -2.286179e-28          -0.074633  1312010.26   \n",
       "4              -0.101456        1.000000e+00          -0.083693     6904.22   \n",
       "\n",
       "   是否长期经营  经营成立时间是否相等  注册资金过小  经营是否已过期  剩余经营天数_天  已经营天数_天  当期经营期限总天数_天  \\\n",
       "0       0           1       0        0      5655     1644         7299   \n",
       "1       1           1       0        0    364059     1748       365807   \n",
       "2       0           1       0        0      9818     1130        10948   \n",
       "3       1           1       0        0    364059     4140       368199   \n",
       "4       1           1       0        0    364059     1203       365262   \n",
       "\n",
       "   自成立经营期限总天数_天  已成立天数_天  再次经营_天  剩余经营天数_月  已经营天数_月  当期经营期限总天数_月  \\\n",
       "0          7299     1644       0       186       54          240   \n",
       "1        365807     1748       0     11969       57        12026   \n",
       "2         10948     1130       0       323       37          360   \n",
       "3        368199     4140       0     11969      136        12105   \n",
       "4        365262     1203       0     11969       39        12008   \n",
       "\n",
       "   自成立经营期限总天数_月  已成立天数_月  再次经营_月    剩余经营天数_年    已经营天数_年  当期经营期限总天数_年  \\\n",
       "0           240       54       0   15.500000   4.500000    20.000000   \n",
       "1         12026       57       0  997.416667   4.750000  1002.166667   \n",
       "2           360       37       0   26.916667   3.083333    30.000000   \n",
       "3         12105      136       0  997.416667  11.333333  1008.750000   \n",
       "4         12008       39       0  997.416667   3.250000  1000.666667   \n",
       "\n",
       "   自成立经营期限总天数_年    已成立天数_年  再次经营_年  经营状态_编码  企业（机构）类型编码_频数是否前10  \\\n",
       "0     20.000000   4.500000     0.0        1                   1   \n",
       "1   1002.166667   4.750000     0.0        1                   1   \n",
       "2     30.000000   3.083333     0.0        1                   1   \n",
       "3   1008.750000  11.333333     0.0        1                   1   \n",
       "4   1000.666667   3.250000     0.0        1                   1   \n",
       "\n",
       "   企业（机构）类型编码_频数是否后20  企业（机构）类型编码_是否频数最高2类  所在省份编码_频数是否前5  所在省份编码_频数是否后5  \\\n",
       "0                   0                    1              0              0   \n",
       "1                   0                    1              1              0   \n",
       "2                   0                    1              0              0   \n",
       "3                   0                    0              1              0   \n",
       "4                   0                    1              0              0   \n",
       "\n",
       "   企业（机构）类型编码_是否坏率最高2类  国民经济行业代码_频数是否前5  国民经济行业代码_频数是否后230  企业（机构）类型编码_分箱  \\\n",
       "0                    0                0                  0              1   \n",
       "1                    0                0                  0              3   \n",
       "2                    0                0                  0              3   \n",
       "3                    0                0                  0              0   \n",
       "4                    0                0                  0              3   \n",
       "\n",
       "   所在省份编码_分箱  国民经济行业代码_分箱  法定代表人相关企业个数  法人涉足企业类型  法人涉足国民经济行业代码  法人跨省个数  \n",
       "0          2            4            1         1             1       1  \n",
       "1          3            3            1         1             1       1  \n",
       "2          2            3            1         1             1       1  \n",
       "3          0            1            1         1             1       1  \n",
       "4          5            3            1         1             1       1  "
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "TARGET = TARGET.merge(AA_basic, on = '客户编号', how = 'left')\n",
    "TARGET.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "f37f7c7a-3a37-490f-bc81-b8ff37cc2034",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T03:19:36.908369Z",
     "iopub.status.busy": "2024-11-11T03:19:36.907868Z",
     "iopub.status.idle": "2024-11-11T03:19:36.969107Z",
     "msg_id": "5d52ee30-a7a7-4db0-b57c-59a1994cb637",
     "shell.execute_reply": "2024-11-11T03:19:36.968445Z",
     "shell.execute_reply.started": "2024-11-11T03:19:36.908342Z"
    }
   },
   "outputs": [],
   "source": [
    "TARGET.to_pickle(\"../data/基本信息表_原本数据加文本特征_A榜.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "ac6a5f9e-da84-4b72-b98b-39c3f69e8443",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T03:20:31.896574Z",
     "iopub.status.busy": "2024-11-11T03:20:31.896072Z",
     "iopub.status.idle": "2024-11-11T03:20:32.000876Z",
     "msg_id": "e7ba8172-ced6-42b1-935c-f8180ff1ec85",
     "shell.execute_reply": "2024-11-11T03:20:32.000195Z",
     "shell.execute_reply.started": "2024-11-11T03:20:31.896541Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# df = pd.read_pickle(\"../data/基本信息表_原本数据加文本特征_A榜.pkl\").drop(columns=['企业（机构）类型编码_分箱', '所在省份编码_分箱', '国民经济行业代码_分箱'])\n",
    "# o_df = pd.read_pickle(\"/home/mole/work/heyuyang/1030/data/基本信息表_原本数据加文本特征_14点.pkl\").drop(columns=['企业（机构）类型编码_分箱', '所在省份编码_分箱', '国民经济行业代码_分箱'])\n",
    "# df.equals(o_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c3a2f1d8-62ac-4341-9a72-e1f73f464764",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6fd3a1fe-b212-4d1c-b11c-3252ac318160",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "ecf3f95f-9caf-435b-8ca0-a85898ac9216",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T03:19:36.970506Z",
     "iopub.status.busy": "2024-11-11T03:19:36.970114Z",
     "iopub.status.idle": "2024-11-11T03:19:37.044830Z",
     "msg_id": "ffa05ed0-2e90-4c0b-9f73-a4d228cfcfb0",
     "shell.execute_reply": "2024-11-11T03:19:37.044242Z",
     "shell.execute_reply.started": "2024-11-11T03:19:36.970479Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>客户编号</th>\n",
       "      <th>总交易金额占比_mean</th>\n",
       "      <th>总交易金额占比_max</th>\n",
       "      <th>总交易金额占比_min</th>\n",
       "      <th>转入金额占比_mean</th>\n",
       "      <th>转入金额占比_max</th>\n",
       "      <th>转入金额占比_min</th>\n",
       "      <th>转出金额占比_mean</th>\n",
       "      <th>转出金额占比_max</th>\n",
       "      <th>转出金额占比_min</th>\n",
       "      <th>本人金额占比_mean</th>\n",
       "      <th>本人金额占比_max</th>\n",
       "      <th>本人金额占比_min</th>\n",
       "      <th>非本人金额占比_mean</th>\n",
       "      <th>非本人金额占比_max</th>\n",
       "      <th>非本人金额占比_min</th>\n",
       "      <th>交易次数小于等于5</th>\n",
       "      <th>总流出金额</th>\n",
       "      <th>总流出笔数</th>\n",
       "      <th>流出平均金额</th>\n",
       "      <th>流出金额方差</th>\n",
       "      <th>流出金额最大值</th>\n",
       "      <th>总流入金额</th>\n",
       "      <th>总流入笔数</th>\n",
       "      <th>流入平均金额</th>\n",
       "      <th>流入金额方差</th>\n",
       "      <th>流入金额最大值</th>\n",
       "      <th>总净流</th>\n",
       "      <th>总金额</th>\n",
       "      <th>总笔数</th>\n",
       "      <th>近一月流出金额</th>\n",
       "      <th>近一月流出笔数</th>\n",
       "      <th>近一月流入金额</th>\n",
       "      <th>近一月流入笔数</th>\n",
       "      <th>近一月总净流</th>\n",
       "      <th>近一月总金额</th>\n",
       "      <th>近一月总笔数</th>\n",
       "      <th>倒数第三月流出金额</th>\n",
       "      <th>倒数第三月流出笔数</th>\n",
       "      <th>倒数第三月流入金额</th>\n",
       "      <th>倒数第三月流入笔数</th>\n",
       "      <th>倒数第三月总净流</th>\n",
       "      <th>倒数第三月总金额</th>\n",
       "      <th>倒数第三月总笔数</th>\n",
       "      <th>第三个月与第一个月流入金额差</th>\n",
       "      <th>第三个月与第一个月流出金额差</th>\n",
       "      <th>第三个月与第一个月总金额差</th>\n",
       "      <th>第三个月与第一个月流出笔数差</th>\n",
       "      <th>第三个月与第一个月流入笔数差</th>\n",
       "      <th>第三个月与第一个月总笔数差</th>\n",
       "      <th>相关客户数</th>\n",
       "      <th>最后交易日流出金额</th>\n",
       "      <th>最后交易日流出笔数</th>\n",
       "      <th>最后交易日流入金额</th>\n",
       "      <th>最后交易日流入笔数</th>\n",
       "      <th>最后交易日总净流</th>\n",
       "      <th>最后交易日总金额</th>\n",
       "      <th>最后交易日总笔数</th>\n",
       "      <th>非工作日交易金额</th>\n",
       "      <th>非工作日交易笔数</th>\n",
       "      <th>企业交易绝对值最高金额</th>\n",
       "      <th>企业交易绝对值最低金额</th>\n",
       "      <th>企业交易绝对值_mean</th>\n",
       "      <th>企业交易绝对值_std</th>\n",
       "      <th>近一月平均账户余额</th>\n",
       "      <th>近一月最大账户余额</th>\n",
       "      <th>近一月账户余额方差</th>\n",
       "      <th>倒数第三个月平均账户余额</th>\n",
       "      <th>倒数第三个月最大账户余额</th>\n",
       "      <th>倒数第三个月账户余额方差</th>\n",
       "      <th>第三个月与第一个月余额均值差</th>\n",
       "      <th>第三个月与第一个月余额最大差</th>\n",
       "      <th>近一月交易代码个数</th>\n",
       "      <th>近一月渠道代码个数</th>\n",
       "      <th>倒数第三月交易代码个数</th>\n",
       "      <th>倒数第三月渠道代码个数</th>\n",
       "      <th>第三个月与第一个月渠道数差</th>\n",
       "      <th>第三个月与第一个月交易代码数差</th>\n",
       "      <th>交易代码_tfidf_8</th>\n",
       "      <th>摘要信息_countvec_3</th>\n",
       "      <th>客户编号_摘要信息_w2v_0</th>\n",
       "      <th>客户编号_交易代码_w2v_2</th>\n",
       "      <th>客户编号_交易代码_w2v_5</th>\n",
       "      <th>渠道代码_countvec_9</th>\n",
       "      <th>客户编号_渠道代码_w2v_6</th>\n",
       "      <th>摘要信息_countvec_2</th>\n",
       "      <th>摘要信息_countvec_0</th>\n",
       "      <th>渠道代码_countvec_8</th>\n",
       "      <th>交易对手客户编号_countvec_7</th>\n",
       "      <th>摘要信息_tfidf_1</th>\n",
       "      <th>摘要信息_countvec_1</th>\n",
       "      <th>合约账户余额_mean_低频交易日</th>\n",
       "      <th>合约账户余额_min_低频交易日</th>\n",
       "      <th>合约账户余额_max_低频交易日</th>\n",
       "      <th>折人民币交易金额_max_低频交易日</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>182d6a854532dd26a1b111e77bd501f4</td>\n",
       "      <td>168110.405905</td>\n",
       "      <td>10098000.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>183601.93711</td>\n",
       "      <td>10098000.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>121635.812291</td>\n",
       "      <td>2146000.0</td>\n",
       "      <td>0.063961</td>\n",
       "      <td>1.044394</td>\n",
       "      <td>2.142105</td>\n",
       "      <td>0.299675</td>\n",
       "      <td>0.373661</td>\n",
       "      <td>1.926746</td>\n",
       "      <td>0.042918</td>\n",
       "      <td>0</td>\n",
       "      <td>12186.1</td>\n",
       "      <td>477.0</td>\n",
       "      <td>25.547379</td>\n",
       "      <td>23.974717</td>\n",
       "      <td>146.52</td>\n",
       "      <td>6168.06</td>\n",
       "      <td>159.0</td>\n",
       "      <td>38.79283</td>\n",
       "      <td>35.191651</td>\n",
       "      <td>184.61</td>\n",
       "      <td>-6018.04</td>\n",
       "      <td>18354.16</td>\n",
       "      <td>636.0</td>\n",
       "      <td>4173.36</td>\n",
       "      <td>162.0</td>\n",
       "      <td>1667.31</td>\n",
       "      <td>28.0</td>\n",
       "      <td>-2506.05</td>\n",
       "      <td>5840.67</td>\n",
       "      <td>190.0</td>\n",
       "      <td>3410.06</td>\n",
       "      <td>131.0</td>\n",
       "      <td>2026.35</td>\n",
       "      <td>40.0</td>\n",
       "      <td>-1383.71</td>\n",
       "      <td>5436.41</td>\n",
       "      <td>171.0</td>\n",
       "      <td>-359.04</td>\n",
       "      <td>763.3</td>\n",
       "      <td>404.26</td>\n",
       "      <td>31.0</td>\n",
       "      <td>-12.0</td>\n",
       "      <td>19.0</td>\n",
       "      <td>39.0</td>\n",
       "      <td>12186.1</td>\n",
       "      <td>477.0</td>\n",
       "      <td>6168.06</td>\n",
       "      <td>159.0</td>\n",
       "      <td>-6018.04</td>\n",
       "      <td>18354.16</td>\n",
       "      <td>636.0</td>\n",
       "      <td>669.24</td>\n",
       "      <td>33.0</td>\n",
       "      <td>184.61</td>\n",
       "      <td>0.0</td>\n",
       "      <td>28.858742</td>\n",
       "      <td>27.784195</td>\n",
       "      <td>108.330579</td>\n",
       "      <td>187.79</td>\n",
       "      <td>39.166084</td>\n",
       "      <td>99.186608</td>\n",
       "      <td>166.73</td>\n",
       "      <td>44.801018</td>\n",
       "      <td>9.143971</td>\n",
       "      <td>21.06</td>\n",
       "      <td>18.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>14.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>-0.028939</td>\n",
       "      <td>-0.216109</td>\n",
       "      <td>0.79734</td>\n",
       "      <td>-0.057123</td>\n",
       "      <td>0.057298</td>\n",
       "      <td>-1.620721</td>\n",
       "      <td>-0.050025</td>\n",
       "      <td>-3.190668</td>\n",
       "      <td>166.624821</td>\n",
       "      <td>-6.69274</td>\n",
       "      <td>-0.124638</td>\n",
       "      <td>0.677956</td>\n",
       "      <td>169.231007</td>\n",
       "      <td>60.462121</td>\n",
       "      <td>0.0</td>\n",
       "      <td>185.67</td>\n",
       "      <td>68.01</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               客户编号   总交易金额占比_mean  总交易金额占比_max  总交易金额占比_min  \\\n",
       "0  182d6a854532dd26a1b111e77bd501f4  168110.405905   10098000.0          0.0   \n",
       "\n",
       "    转入金额占比_mean  转入金额占比_max  转入金额占比_min    转出金额占比_mean  转出金额占比_max  \\\n",
       "0  183601.93711  10098000.0         0.0  121635.812291   2146000.0   \n",
       "\n",
       "   转出金额占比_min  本人金额占比_mean  本人金额占比_max  本人金额占比_min  非本人金额占比_mean  非本人金额占比_max  \\\n",
       "0    0.063961     1.044394    2.142105    0.299675      0.373661     1.926746   \n",
       "\n",
       "   非本人金额占比_min  交易次数小于等于5    总流出金额  总流出笔数     流出平均金额     流出金额方差  流出金额最大值  \\\n",
       "0     0.042918          0  12186.1  477.0  25.547379  23.974717   146.52   \n",
       "\n",
       "     总流入金额  总流入笔数    流入平均金额     流入金额方差  流入金额最大值      总净流       总金额    总笔数  \\\n",
       "0  6168.06  159.0  38.79283  35.191651   184.61 -6018.04  18354.16  636.0   \n",
       "\n",
       "   近一月流出金额  近一月流出笔数  近一月流入金额  近一月流入笔数   近一月总净流   近一月总金额  近一月总笔数  倒数第三月流出金额  \\\n",
       "0  4173.36    162.0  1667.31     28.0 -2506.05  5840.67   190.0    3410.06   \n",
       "\n",
       "   倒数第三月流出笔数  倒数第三月流入金额  倒数第三月流入笔数  倒数第三月总净流  倒数第三月总金额  倒数第三月总笔数  \\\n",
       "0      131.0    2026.35       40.0  -1383.71   5436.41     171.0   \n",
       "\n",
       "   第三个月与第一个月流入金额差  第三个月与第一个月流出金额差  第三个月与第一个月总金额差  第三个月与第一个月流出笔数差  \\\n",
       "0         -359.04           763.3         404.26            31.0   \n",
       "\n",
       "   第三个月与第一个月流入笔数差  第三个月与第一个月总笔数差  相关客户数  最后交易日流出金额  最后交易日流出笔数  最后交易日流入金额  \\\n",
       "0           -12.0           19.0   39.0    12186.1      477.0    6168.06   \n",
       "\n",
       "   最后交易日流入笔数  最后交易日总净流  最后交易日总金额  最后交易日总笔数  非工作日交易金额  非工作日交易笔数  企业交易绝对值最高金额  \\\n",
       "0      159.0  -6018.04  18354.16     636.0    669.24      33.0       184.61   \n",
       "\n",
       "   企业交易绝对值最低金额  企业交易绝对值_mean  企业交易绝对值_std   近一月平均账户余额  近一月最大账户余额  近一月账户余额方差  \\\n",
       "0          0.0     28.858742    27.784195  108.330579     187.79  39.166084   \n",
       "\n",
       "   倒数第三个月平均账户余额  倒数第三个月最大账户余额  倒数第三个月账户余额方差  第三个月与第一个月余额均值差  第三个月与第一个月余额最大差  \\\n",
       "0     99.186608        166.73     44.801018        9.143971           21.06   \n",
       "\n",
       "   近一月交易代码个数  近一月渠道代码个数  倒数第三月交易代码个数  倒数第三月渠道代码个数  第三个月与第一个月渠道数差  \\\n",
       "0       18.0       10.0         14.0          8.0            2.0   \n",
       "\n",
       "   第三个月与第一个月交易代码数差  交易代码_tfidf_8  摘要信息_countvec_3  客户编号_摘要信息_w2v_0  \\\n",
       "0              4.0     -0.028939        -0.216109          0.79734   \n",
       "\n",
       "   客户编号_交易代码_w2v_2  客户编号_交易代码_w2v_5  渠道代码_countvec_9  客户编号_渠道代码_w2v_6  \\\n",
       "0        -0.057123         0.057298        -1.620721        -0.050025   \n",
       "\n",
       "   摘要信息_countvec_2  摘要信息_countvec_0  渠道代码_countvec_8  交易对手客户编号_countvec_7  \\\n",
       "0        -3.190668       166.624821         -6.69274            -0.124638   \n",
       "\n",
       "   摘要信息_tfidf_1  摘要信息_countvec_1  合约账户余额_mean_低频交易日  合约账户余额_min_低频交易日  \\\n",
       "0      0.677956       169.231007          60.462121               0.0   \n",
       "\n",
       "   合约账户余额_max_低频交易日  折人民币交易金额_max_低频交易日  \n",
       "0            185.67               68.01  "
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "AA_tr = feature_hyy[['客户编号', '总交易金额占比_mean', '总交易金额占比_max', '总交易金额占比_min', '转入金额占比_mean',\n",
    "       '转入金额占比_max', '转入金额占比_min', '转出金额占比_mean', '转出金额占比_max', '转出金额占比_min',\n",
    "       '本人金额占比_mean', '本人金额占比_max', '本人金额占比_min', '非本人金额占比_mean',\n",
    "       '非本人金额占比_max', '非本人金额占比_min', '交易次数小于等于5', '总流出金额', '总流出笔数', '流出平均金额',\n",
    "       '流出金额方差', '流出金额最大值', '总流入金额', '总流入笔数', '流入平均金额', '流入金额方差', '流入金额最大值',\n",
    "       '总净流', '总金额', '总笔数', '近一月流出金额', '近一月流出笔数', '近一月流入金额', '近一月流入笔数',\n",
    "       '近一月总净流', '近一月总金额', '近一月总笔数', '倒数第三月流出金额', '倒数第三月流出笔数', '倒数第三月流入金额',\n",
    "       '倒数第三月流入笔数', '倒数第三月总净流', '倒数第三月总金额', '倒数第三月总笔数', '第三个月与第一个月流入金额差',\n",
    "       '第三个月与第一个月流出金额差', '第三个月与第一个月总金额差', '第三个月与第一个月流出笔数差', '第三个月与第一个月流入笔数差',\n",
    "       '第三个月与第一个月总笔数差', '相关客户数', '最后交易日流出金额', '最后交易日流出笔数', '最后交易日流入金额',\n",
    "       '最后交易日流入笔数', '最后交易日总净流', '最后交易日总金额', '最后交易日总笔数', '非工作日交易金额', '非工作日交易笔数',\n",
    "       '企业交易绝对值最高金额', '企业交易绝对值最低金额', '企业交易绝对值_mean', '企业交易绝对值_std',\n",
    "       '近一月平均账户余额', '近一月最大账户余额', '近一月账户余额方差', '倒数第三个月平均账户余额', '倒数第三个月最大账户余额',\n",
    "       '倒数第三个月账户余额方差', '第三个月与第一个月余额均值差', '第三个月与第一个月余额最大差', '近一月交易代码个数',\n",
    "       '近一月渠道代码个数', '倒数第三月交易代码个数', '倒数第三月渠道代码个数', '第三个月与第一个月渠道数差',\n",
    "       '第三个月与第一个月交易代码数差', '交易代码_tfidf_8', '摘要信息_countvec_3', '客户编号_摘要信息_w2v_0',\n",
    "       '客户编号_交易代码_w2v_2', '客户编号_交易代码_w2v_5', '渠道代码_countvec_9',\n",
    "       '客户编号_渠道代码_w2v_6', '摘要信息_countvec_2', '摘要信息_countvec_0',\n",
    "       '渠道代码_countvec_8', '交易对手客户编号_countvec_7', '摘要信息_tfidf_1',\n",
    "       '摘要信息_countvec_1', '合约账户余额_mean_低频交易日', '合约账户余额_min_低频交易日',\n",
    "       '合约账户余额_max_低频交易日', '折人民币交易金额_max_低频交易日']]\n",
    "AA_tr.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "80122719-8aeb-45f8-b249-f0c40d94a007",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T03:19:37.045952Z",
     "iopub.status.busy": "2024-11-11T03:19:37.045653Z",
     "iopub.status.idle": "2024-11-11T03:19:37.116477Z",
     "msg_id": "ac0b7619-045c-4dba-b933-5b38b5459ba2",
     "shell.execute_reply": "2024-11-11T03:19:37.115865Z",
     "shell.execute_reply.started": "2024-11-11T03:19:37.045925Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(59079, 1)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>客户编号</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>158a8d99bec2a2b652a6de45a2b52ec9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>b1d244a25a82adb7beafe33fe971402c</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>85b1ab1270516d2ebe21ed00c6abbf27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ef194610bdbecdea9af3cc23bceba8b2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1220f9592fdd0b3fa9bbbd90e6d69d84</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               客户编号\n",
       "0  158a8d99bec2a2b652a6de45a2b52ec9\n",
       "1  b1d244a25a82adb7beafe33fe971402c\n",
       "2  85b1ab1270516d2ebe21ed00c6abbf27\n",
       "3  ef194610bdbecdea9af3cc23bceba8b2\n",
       "4  1220f9592fdd0b3fa9bbbd90e6d69d84"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "file_name = 'XW_ENTINFO_TARGET'\n",
    "TARGET = get_data(file_name, num_rows=None)\n",
    "TARGET = TARGET.drop(['数据日期', 'FLAG', 'is_train'], axis = 1)\n",
    "print(TARGET.shape)\n",
    "TARGET.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "58731b05-46ee-42d1-8ddf-f2dcfc47158f",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T03:19:37.117824Z",
     "iopub.status.busy": "2024-11-11T03:19:37.117335Z",
     "iopub.status.idle": "2024-11-11T03:19:37.335163Z",
     "msg_id": "dcfd042e-2c7a-4a82-b9d5-991480ef1131",
     "shell.execute_reply": "2024-11-11T03:19:37.334527Z",
     "shell.execute_reply.started": "2024-11-11T03:19:37.117799Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>客户编号</th>\n",
       "      <th>总交易金额占比_mean</th>\n",
       "      <th>总交易金额占比_max</th>\n",
       "      <th>总交易金额占比_min</th>\n",
       "      <th>转入金额占比_mean</th>\n",
       "      <th>转入金额占比_max</th>\n",
       "      <th>转入金额占比_min</th>\n",
       "      <th>转出金额占比_mean</th>\n",
       "      <th>转出金额占比_max</th>\n",
       "      <th>转出金额占比_min</th>\n",
       "      <th>本人金额占比_mean</th>\n",
       "      <th>本人金额占比_max</th>\n",
       "      <th>本人金额占比_min</th>\n",
       "      <th>非本人金额占比_mean</th>\n",
       "      <th>非本人金额占比_max</th>\n",
       "      <th>非本人金额占比_min</th>\n",
       "      <th>交易次数小于等于5</th>\n",
       "      <th>总流出金额</th>\n",
       "      <th>总流出笔数</th>\n",
       "      <th>流出平均金额</th>\n",
       "      <th>流出金额方差</th>\n",
       "      <th>流出金额最大值</th>\n",
       "      <th>总流入金额</th>\n",
       "      <th>总流入笔数</th>\n",
       "      <th>流入平均金额</th>\n",
       "      <th>流入金额方差</th>\n",
       "      <th>流入金额最大值</th>\n",
       "      <th>总净流</th>\n",
       "      <th>总金额</th>\n",
       "      <th>总笔数</th>\n",
       "      <th>近一月流出金额</th>\n",
       "      <th>近一月流出笔数</th>\n",
       "      <th>近一月流入金额</th>\n",
       "      <th>近一月流入笔数</th>\n",
       "      <th>近一月总净流</th>\n",
       "      <th>近一月总金额</th>\n",
       "      <th>近一月总笔数</th>\n",
       "      <th>倒数第三月流出金额</th>\n",
       "      <th>倒数第三月流出笔数</th>\n",
       "      <th>倒数第三月流入金额</th>\n",
       "      <th>倒数第三月流入笔数</th>\n",
       "      <th>倒数第三月总净流</th>\n",
       "      <th>倒数第三月总金额</th>\n",
       "      <th>倒数第三月总笔数</th>\n",
       "      <th>第三个月与第一个月流入金额差</th>\n",
       "      <th>第三个月与第一个月流出金额差</th>\n",
       "      <th>第三个月与第一个月总金额差</th>\n",
       "      <th>第三个月与第一个月流出笔数差</th>\n",
       "      <th>第三个月与第一个月流入笔数差</th>\n",
       "      <th>第三个月与第一个月总笔数差</th>\n",
       "      <th>相关客户数</th>\n",
       "      <th>最后交易日流出金额</th>\n",
       "      <th>最后交易日流出笔数</th>\n",
       "      <th>最后交易日流入金额</th>\n",
       "      <th>最后交易日流入笔数</th>\n",
       "      <th>最后交易日总净流</th>\n",
       "      <th>最后交易日总金额</th>\n",
       "      <th>最后交易日总笔数</th>\n",
       "      <th>非工作日交易金额</th>\n",
       "      <th>非工作日交易笔数</th>\n",
       "      <th>企业交易绝对值最高金额</th>\n",
       "      <th>企业交易绝对值最低金额</th>\n",
       "      <th>企业交易绝对值_mean</th>\n",
       "      <th>企业交易绝对值_std</th>\n",
       "      <th>近一月平均账户余额</th>\n",
       "      <th>近一月最大账户余额</th>\n",
       "      <th>近一月账户余额方差</th>\n",
       "      <th>倒数第三个月平均账户余额</th>\n",
       "      <th>倒数第三个月最大账户余额</th>\n",
       "      <th>倒数第三个月账户余额方差</th>\n",
       "      <th>第三个月与第一个月余额均值差</th>\n",
       "      <th>第三个月与第一个月余额最大差</th>\n",
       "      <th>近一月交易代码个数</th>\n",
       "      <th>近一月渠道代码个数</th>\n",
       "      <th>倒数第三月交易代码个数</th>\n",
       "      <th>倒数第三月渠道代码个数</th>\n",
       "      <th>第三个月与第一个月渠道数差</th>\n",
       "      <th>第三个月与第一个月交易代码数差</th>\n",
       "      <th>交易代码_tfidf_8</th>\n",
       "      <th>摘要信息_countvec_3</th>\n",
       "      <th>客户编号_摘要信息_w2v_0</th>\n",
       "      <th>客户编号_交易代码_w2v_2</th>\n",
       "      <th>客户编号_交易代码_w2v_5</th>\n",
       "      <th>渠道代码_countvec_9</th>\n",
       "      <th>客户编号_渠道代码_w2v_6</th>\n",
       "      <th>摘要信息_countvec_2</th>\n",
       "      <th>摘要信息_countvec_0</th>\n",
       "      <th>渠道代码_countvec_8</th>\n",
       "      <th>交易对手客户编号_countvec_7</th>\n",
       "      <th>摘要信息_tfidf_1</th>\n",
       "      <th>摘要信息_countvec_1</th>\n",
       "      <th>合约账户余额_mean_低频交易日</th>\n",
       "      <th>合约账户余额_min_低频交易日</th>\n",
       "      <th>合约账户余额_max_低频交易日</th>\n",
       "      <th>折人民币交易金额_max_低频交易日</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>158a8d99bec2a2b652a6de45a2b52ec9</td>\n",
       "      <td>141147.721138</td>\n",
       "      <td>1.740000e+06</td>\n",
       "      <td>0.014808</td>\n",
       "      <td>161000.670796</td>\n",
       "      <td>1.183000e+06</td>\n",
       "      <td>0.014808</td>\n",
       "      <td>1.160007e+05</td>\n",
       "      <td>1.740000e+06</td>\n",
       "      <td>0.147927</td>\n",
       "      <td>1.629809</td>\n",
       "      <td>2.799176</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.985128</td>\n",
       "      <td>0.985128</td>\n",
       "      <td>0.985128</td>\n",
       "      <td>0</td>\n",
       "      <td>583.82</td>\n",
       "      <td>19.0</td>\n",
       "      <td>30.727368</td>\n",
       "      <td>38.514283</td>\n",
       "      <td>106.75</td>\n",
       "      <td>418.88</td>\n",
       "      <td>15.0</td>\n",
       "      <td>27.925333</td>\n",
       "      <td>36.134326</td>\n",
       "      <td>106.75</td>\n",
       "      <td>-164.94</td>\n",
       "      <td>1002.70</td>\n",
       "      <td>34.0</td>\n",
       "      <td>130.15</td>\n",
       "      <td>5.0</td>\n",
       "      <td>78.81</td>\n",
       "      <td>2.0</td>\n",
       "      <td>-51.34</td>\n",
       "      <td>208.96</td>\n",
       "      <td>7.0</td>\n",
       "      <td>388.28</td>\n",
       "      <td>9.0</td>\n",
       "      <td>213.50</td>\n",
       "      <td>2.0</td>\n",
       "      <td>-174.78</td>\n",
       "      <td>601.78</td>\n",
       "      <td>11.0</td>\n",
       "      <td>-134.69</td>\n",
       "      <td>-258.13</td>\n",
       "      <td>-392.82</td>\n",
       "      <td>-4.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-4.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>69.60</td>\n",
       "      <td>4.0</td>\n",
       "      <td>106.75</td>\n",
       "      <td>1.24</td>\n",
       "      <td>29.491176</td>\n",
       "      <td>36.946221</td>\n",
       "      <td>30.410000</td>\n",
       "      <td>75.98</td>\n",
       "      <td>21.636022</td>\n",
       "      <td>69.550909</td>\n",
       "      <td>106.75</td>\n",
       "      <td>26.308696</td>\n",
       "      <td>-39.140909</td>\n",
       "      <td>-30.77</td>\n",
       "      <td>6.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>-0.074365</td>\n",
       "      <td>-0.002396</td>\n",
       "      <td>-0.327669</td>\n",
       "      <td>-0.349541</td>\n",
       "      <td>-0.285305</td>\n",
       "      <td>-0.612251</td>\n",
       "      <td>0.061953</td>\n",
       "      <td>-0.020211</td>\n",
       "      <td>23.004029</td>\n",
       "      <td>-0.079323</td>\n",
       "      <td>-0.010378</td>\n",
       "      <td>-0.020252</td>\n",
       "      <td>0.920446</td>\n",
       "      <td>18.545000</td>\n",
       "      <td>0.00</td>\n",
       "      <td>39.38</td>\n",
       "      <td>17.40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>b1d244a25a82adb7beafe33fe971402c</td>\n",
       "      <td>200375.469589</td>\n",
       "      <td>1.603000e+06</td>\n",
       "      <td>0.089653</td>\n",
       "      <td>0.595222</td>\n",
       "      <td>9.999994e-01</td>\n",
       "      <td>0.354194</td>\n",
       "      <td>5.343336e+05</td>\n",
       "      <td>1.603000e+06</td>\n",
       "      <td>0.089653</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>66.60</td>\n",
       "      <td>5.0</td>\n",
       "      <td>13.320000</td>\n",
       "      <td>5.959631</td>\n",
       "      <td>16.03</td>\n",
       "      <td>34.72</td>\n",
       "      <td>3.0</td>\n",
       "      <td>11.573333</td>\n",
       "      <td>7.719173</td>\n",
       "      <td>16.03</td>\n",
       "      <td>-31.88</td>\n",
       "      <td>101.32</td>\n",
       "      <td>8.0</td>\n",
       "      <td>18.69</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.66</td>\n",
       "      <td>1.0</td>\n",
       "      <td>-16.03</td>\n",
       "      <td>21.35</td>\n",
       "      <td>3.0</td>\n",
       "      <td>32.06</td>\n",
       "      <td>2.0</td>\n",
       "      <td>32.06</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>64.12</td>\n",
       "      <td>4.0</td>\n",
       "      <td>-29.40</td>\n",
       "      <td>-13.37</td>\n",
       "      <td>-42.77</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15.85</td>\n",
       "      <td>1.0</td>\n",
       "      <td>16.03</td>\n",
       "      <td>2.66</td>\n",
       "      <td>12.665000</td>\n",
       "      <td>6.175532</td>\n",
       "      <td>21.733333</td>\n",
       "      <td>29.67</td>\n",
       "      <td>12.345365</td>\n",
       "      <td>17.582500</td>\n",
       "      <td>31.10</td>\n",
       "      <td>13.239256</td>\n",
       "      <td>4.150833</td>\n",
       "      <td>-1.43</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>-0.086773</td>\n",
       "      <td>0.000041</td>\n",
       "      <td>-0.670497</td>\n",
       "      <td>-0.504787</td>\n",
       "      <td>-0.342732</td>\n",
       "      <td>-0.156541</td>\n",
       "      <td>0.071143</td>\n",
       "      <td>0.002159</td>\n",
       "      <td>6.001223</td>\n",
       "      <td>0.085791</td>\n",
       "      <td>-0.002414</td>\n",
       "      <td>-0.114354</td>\n",
       "      <td>-0.012561</td>\n",
       "      <td>29.660000</td>\n",
       "      <td>29.66</td>\n",
       "      <td>29.66</td>\n",
       "      <td>15.85</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>85b1ab1270516d2ebe21ed00c6abbf27</td>\n",
       "      <td>1.003761</td>\n",
       "      <td>1.045173e+01</td>\n",
       "      <td>0.025224</td>\n",
       "      <td>1.105384</td>\n",
       "      <td>1.045173e+01</td>\n",
       "      <td>0.025224</td>\n",
       "      <td>8.239674e-01</td>\n",
       "      <td>9.995093e-01</td>\n",
       "      <td>0.068707</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.197963</td>\n",
       "      <td>10.451731</td>\n",
       "      <td>0.278051</td>\n",
       "      <td>0</td>\n",
       "      <td>1401.44</td>\n",
       "      <td>46.0</td>\n",
       "      <td>30.466087</td>\n",
       "      <td>20.961076</td>\n",
       "      <td>85.69</td>\n",
       "      <td>1181.79</td>\n",
       "      <td>26.0</td>\n",
       "      <td>45.453462</td>\n",
       "      <td>14.235373</td>\n",
       "      <td>74.85</td>\n",
       "      <td>-219.65</td>\n",
       "      <td>2583.23</td>\n",
       "      <td>72.0</td>\n",
       "      <td>318.77</td>\n",
       "      <td>13.0</td>\n",
       "      <td>381.63</td>\n",
       "      <td>9.0</td>\n",
       "      <td>62.86</td>\n",
       "      <td>700.40</td>\n",
       "      <td>22.0</td>\n",
       "      <td>856.22</td>\n",
       "      <td>26.0</td>\n",
       "      <td>580.31</td>\n",
       "      <td>12.0</td>\n",
       "      <td>-275.91</td>\n",
       "      <td>1436.53</td>\n",
       "      <td>38.0</td>\n",
       "      <td>-198.68</td>\n",
       "      <td>-537.45</td>\n",
       "      <td>-736.13</td>\n",
       "      <td>-13.0</td>\n",
       "      <td>-3.0</td>\n",
       "      <td>-16.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>1401.44</td>\n",
       "      <td>46.0</td>\n",
       "      <td>1181.79</td>\n",
       "      <td>26.0</td>\n",
       "      <td>-219.65</td>\n",
       "      <td>2583.23</td>\n",
       "      <td>72.0</td>\n",
       "      <td>492.75</td>\n",
       "      <td>15.0</td>\n",
       "      <td>85.69</td>\n",
       "      <td>1.24</td>\n",
       "      <td>35.878194</td>\n",
       "      <td>20.059370</td>\n",
       "      <td>46.608636</td>\n",
       "      <td>86.19</td>\n",
       "      <td>20.001098</td>\n",
       "      <td>45.889474</td>\n",
       "      <td>87.49</td>\n",
       "      <td>19.815098</td>\n",
       "      <td>0.719163</td>\n",
       "      <td>-1.30</td>\n",
       "      <td>9.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>7.0</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.138988</td>\n",
       "      <td>-0.025896</td>\n",
       "      <td>0.914132</td>\n",
       "      <td>-0.204240</td>\n",
       "      <td>-0.197763</td>\n",
       "      <td>1.939829</td>\n",
       "      <td>-0.040857</td>\n",
       "      <td>-0.380561</td>\n",
       "      <td>12.086482</td>\n",
       "      <td>4.668638</td>\n",
       "      <td>-0.006806</td>\n",
       "      <td>0.628993</td>\n",
       "      <td>22.356446</td>\n",
       "      <td>50.120667</td>\n",
       "      <td>29.40</td>\n",
       "      <td>79.02</td>\n",
       "      <td>70.68</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ef194610bdbecdea9af3cc23bceba8b2</td>\n",
       "      <td>857195.157298</td>\n",
       "      <td>2.757700e+07</td>\n",
       "      <td>0.003564</td>\n",
       "      <td>459414.600553</td>\n",
       "      <td>2.757700e+07</td>\n",
       "      <td>0.003564</td>\n",
       "      <td>1.750457e+06</td>\n",
       "      <td>2.757700e+07</td>\n",
       "      <td>0.090624</td>\n",
       "      <td>0.651170</td>\n",
       "      <td>0.994141</td>\n",
       "      <td>0.352156</td>\n",
       "      <td>0.632939</td>\n",
       "      <td>4.278525</td>\n",
       "      <td>0.102491</td>\n",
       "      <td>0</td>\n",
       "      <td>7212.01</td>\n",
       "      <td>128.0</td>\n",
       "      <td>56.343828</td>\n",
       "      <td>83.971828</td>\n",
       "      <td>397.73</td>\n",
       "      <td>5914.57</td>\n",
       "      <td>57.0</td>\n",
       "      <td>103.764386</td>\n",
       "      <td>97.698285</td>\n",
       "      <td>397.73</td>\n",
       "      <td>-1297.44</td>\n",
       "      <td>13126.58</td>\n",
       "      <td>185.0</td>\n",
       "      <td>980.43</td>\n",
       "      <td>36.0</td>\n",
       "      <td>471.99</td>\n",
       "      <td>7.0</td>\n",
       "      <td>-508.44</td>\n",
       "      <td>1452.42</td>\n",
       "      <td>43.0</td>\n",
       "      <td>5494.37</td>\n",
       "      <td>61.0</td>\n",
       "      <td>4929.08</td>\n",
       "      <td>36.0</td>\n",
       "      <td>-565.29</td>\n",
       "      <td>10423.45</td>\n",
       "      <td>97.0</td>\n",
       "      <td>-4457.09</td>\n",
       "      <td>-4513.94</td>\n",
       "      <td>-8971.03</td>\n",
       "      <td>-25.0</td>\n",
       "      <td>-29.0</td>\n",
       "      <td>-54.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>7212.01</td>\n",
       "      <td>128.0</td>\n",
       "      <td>5914.57</td>\n",
       "      <td>57.0</td>\n",
       "      <td>-1297.44</td>\n",
       "      <td>13126.58</td>\n",
       "      <td>185.0</td>\n",
       "      <td>1241.18</td>\n",
       "      <td>39.0</td>\n",
       "      <td>397.73</td>\n",
       "      <td>1.24</td>\n",
       "      <td>70.954486</td>\n",
       "      <td>90.850831</td>\n",
       "      <td>64.904419</td>\n",
       "      <td>117.28</td>\n",
       "      <td>17.986447</td>\n",
       "      <td>135.555567</td>\n",
       "      <td>398.12</td>\n",
       "      <td>125.497190</td>\n",
       "      <td>-70.651148</td>\n",
       "      <td>-280.84</td>\n",
       "      <td>9.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>7.0</td>\n",
       "      <td>-3.0</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>-0.096726</td>\n",
       "      <td>-0.004582</td>\n",
       "      <td>-0.261908</td>\n",
       "      <td>-0.143365</td>\n",
       "      <td>-0.145577</td>\n",
       "      <td>8.922879</td>\n",
       "      <td>-0.014322</td>\n",
       "      <td>0.026084</td>\n",
       "      <td>78.021008</td>\n",
       "      <td>-24.824091</td>\n",
       "      <td>-0.039502</td>\n",
       "      <td>0.002599</td>\n",
       "      <td>1.274078</td>\n",
       "      <td>38.795641</td>\n",
       "      <td>0.00</td>\n",
       "      <td>67.80</td>\n",
       "      <td>59.41</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1220f9592fdd0b3fa9bbbd90e6d69d84</td>\n",
       "      <td>175351.066009</td>\n",
       "      <td>1.672000e+06</td>\n",
       "      <td>0.089402</td>\n",
       "      <td>229376.699158</td>\n",
       "      <td>1.070000e+06</td>\n",
       "      <td>0.920726</td>\n",
       "      <td>1.393340e+05</td>\n",
       "      <td>1.672000e+06</td>\n",
       "      <td>0.089402</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.431314</td>\n",
       "      <td>2.761791</td>\n",
       "      <td>0.985790</td>\n",
       "      <td>0</td>\n",
       "      <td>133.40</td>\n",
       "      <td>8.0</td>\n",
       "      <td>16.675000</td>\n",
       "      <td>13.400853</td>\n",
       "      <td>46.84</td>\n",
       "      <td>170.36</td>\n",
       "      <td>12.0</td>\n",
       "      <td>14.196667</td>\n",
       "      <td>11.429948</td>\n",
       "      <td>47.16</td>\n",
       "      <td>36.96</td>\n",
       "      <td>303.76</td>\n",
       "      <td>20.0</td>\n",
       "      <td>18.23</td>\n",
       "      <td>2.0</td>\n",
       "      <td>18.16</td>\n",
       "      <td>2.0</td>\n",
       "      <td>-0.07</td>\n",
       "      <td>36.39</td>\n",
       "      <td>4.0</td>\n",
       "      <td>98.63</td>\n",
       "      <td>5.0</td>\n",
       "      <td>135.65</td>\n",
       "      <td>9.0</td>\n",
       "      <td>37.02</td>\n",
       "      <td>234.28</td>\n",
       "      <td>14.0</td>\n",
       "      <td>-117.49</td>\n",
       "      <td>-80.40</td>\n",
       "      <td>-197.89</td>\n",
       "      <td>-3.0</td>\n",
       "      <td>-7.0</td>\n",
       "      <td>-10.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>153.75</td>\n",
       "      <td>11.0</td>\n",
       "      <td>47.16</td>\n",
       "      <td>1.51</td>\n",
       "      <td>15.188000</td>\n",
       "      <td>11.972871</td>\n",
       "      <td>10.155000</td>\n",
       "      <td>16.89</td>\n",
       "      <td>7.911546</td>\n",
       "      <td>15.103571</td>\n",
       "      <td>47.57</td>\n",
       "      <td>13.257565</td>\n",
       "      <td>-4.948571</td>\n",
       "      <td>-30.68</td>\n",
       "      <td>3.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.188417</td>\n",
       "      <td>-0.000601</td>\n",
       "      <td>-0.813936</td>\n",
       "      <td>-0.492736</td>\n",
       "      <td>-0.404158</td>\n",
       "      <td>-0.121278</td>\n",
       "      <td>0.042085</td>\n",
       "      <td>0.003389</td>\n",
       "      <td>14.001670</td>\n",
       "      <td>2.227816</td>\n",
       "      <td>-0.005464</td>\n",
       "      <td>-0.079735</td>\n",
       "      <td>-0.014037</td>\n",
       "      <td>15.187273</td>\n",
       "      <td>0.00</td>\n",
       "      <td>47.57</td>\n",
       "      <td>47.16</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               客户编号   总交易金额占比_mean   总交易金额占比_max  总交易金额占比_min  \\\n",
       "0  158a8d99bec2a2b652a6de45a2b52ec9  141147.721138  1.740000e+06     0.014808   \n",
       "1  b1d244a25a82adb7beafe33fe971402c  200375.469589  1.603000e+06     0.089653   \n",
       "2  85b1ab1270516d2ebe21ed00c6abbf27       1.003761  1.045173e+01     0.025224   \n",
       "3  ef194610bdbecdea9af3cc23bceba8b2  857195.157298  2.757700e+07     0.003564   \n",
       "4  1220f9592fdd0b3fa9bbbd90e6d69d84  175351.066009  1.672000e+06     0.089402   \n",
       "\n",
       "     转入金额占比_mean    转入金额占比_max  转入金额占比_min   转出金额占比_mean    转出金额占比_max  \\\n",
       "0  161000.670796  1.183000e+06    0.014808  1.160007e+05  1.740000e+06   \n",
       "1       0.595222  9.999994e-01    0.354194  5.343336e+05  1.603000e+06   \n",
       "2       1.105384  1.045173e+01    0.025224  8.239674e-01  9.995093e-01   \n",
       "3  459414.600553  2.757700e+07    0.003564  1.750457e+06  2.757700e+07   \n",
       "4  229376.699158  1.070000e+06    0.920726  1.393340e+05  1.672000e+06   \n",
       "\n",
       "   转出金额占比_min  本人金额占比_mean  本人金额占比_max  本人金额占比_min  非本人金额占比_mean  非本人金额占比_max  \\\n",
       "0    0.147927     1.629809    2.799176    1.000000      0.985128     0.985128   \n",
       "1    0.089653          NaN         NaN         NaN           NaN          NaN   \n",
       "2    0.068707          NaN         NaN         NaN      1.197963    10.451731   \n",
       "3    0.090624     0.651170    0.994141    0.352156      0.632939     4.278525   \n",
       "4    0.089402          NaN         NaN         NaN      1.431314     2.761791   \n",
       "\n",
       "   非本人金额占比_min  交易次数小于等于5    总流出金额  总流出笔数     流出平均金额     流出金额方差  流出金额最大值  \\\n",
       "0     0.985128          0   583.82   19.0  30.727368  38.514283   106.75   \n",
       "1          NaN          0    66.60    5.0  13.320000   5.959631    16.03   \n",
       "2     0.278051          0  1401.44   46.0  30.466087  20.961076    85.69   \n",
       "3     0.102491          0  7212.01  128.0  56.343828  83.971828   397.73   \n",
       "4     0.985790          0   133.40    8.0  16.675000  13.400853    46.84   \n",
       "\n",
       "     总流入金额  总流入笔数      流入平均金额     流入金额方差  流入金额最大值      总净流       总金额    总笔数  \\\n",
       "0   418.88   15.0   27.925333  36.134326   106.75  -164.94   1002.70   34.0   \n",
       "1    34.72    3.0   11.573333   7.719173    16.03   -31.88    101.32    8.0   \n",
       "2  1181.79   26.0   45.453462  14.235373    74.85  -219.65   2583.23   72.0   \n",
       "3  5914.57   57.0  103.764386  97.698285   397.73 -1297.44  13126.58  185.0   \n",
       "4   170.36   12.0   14.196667  11.429948    47.16    36.96    303.76   20.0   \n",
       "\n",
       "   近一月流出金额  近一月流出笔数  近一月流入金额  近一月流入笔数  近一月总净流   近一月总金额  近一月总笔数  倒数第三月流出金额  \\\n",
       "0   130.15      5.0    78.81      2.0  -51.34   208.96     7.0     388.28   \n",
       "1    18.69      2.0     2.66      1.0  -16.03    21.35     3.0      32.06   \n",
       "2   318.77     13.0   381.63      9.0   62.86   700.40    22.0     856.22   \n",
       "3   980.43     36.0   471.99      7.0 -508.44  1452.42    43.0    5494.37   \n",
       "4    18.23      2.0    18.16      2.0   -0.07    36.39     4.0      98.63   \n",
       "\n",
       "   倒数第三月流出笔数  倒数第三月流入金额  倒数第三月流入笔数  倒数第三月总净流  倒数第三月总金额  倒数第三月总笔数  \\\n",
       "0        9.0     213.50        2.0   -174.78    601.78      11.0   \n",
       "1        2.0      32.06        2.0      0.00     64.12       4.0   \n",
       "2       26.0     580.31       12.0   -275.91   1436.53      38.0   \n",
       "3       61.0    4929.08       36.0   -565.29  10423.45      97.0   \n",
       "4        5.0     135.65        9.0     37.02    234.28      14.0   \n",
       "\n",
       "   第三个月与第一个月流入金额差  第三个月与第一个月流出金额差  第三个月与第一个月总金额差  第三个月与第一个月流出笔数差  \\\n",
       "0         -134.69         -258.13        -392.82            -4.0   \n",
       "1          -29.40          -13.37         -42.77             0.0   \n",
       "2         -198.68         -537.45        -736.13           -13.0   \n",
       "3        -4457.09        -4513.94       -8971.03           -25.0   \n",
       "4         -117.49          -80.40        -197.89            -3.0   \n",
       "\n",
       "   第三个月与第一个月流入笔数差  第三个月与第一个月总笔数差  相关客户数  最后交易日流出金额  最后交易日流出笔数  最后交易日流入金额  \\\n",
       "0             0.0           -4.0    2.0        NaN        NaN        NaN   \n",
       "1            -1.0           -1.0    1.0        NaN        NaN        NaN   \n",
       "2            -3.0          -16.0    6.0    1401.44       46.0    1181.79   \n",
       "3           -29.0          -54.0    5.0    7212.01      128.0    5914.57   \n",
       "4            -7.0          -10.0    2.0        NaN        NaN        NaN   \n",
       "\n",
       "   最后交易日流入笔数  最后交易日总净流  最后交易日总金额  最后交易日总笔数  非工作日交易金额  非工作日交易笔数  企业交易绝对值最高金额  \\\n",
       "0        NaN       NaN       NaN       NaN     69.60       4.0       106.75   \n",
       "1        NaN       NaN       NaN       NaN     15.85       1.0        16.03   \n",
       "2       26.0   -219.65   2583.23      72.0    492.75      15.0        85.69   \n",
       "3       57.0  -1297.44  13126.58     185.0   1241.18      39.0       397.73   \n",
       "4        NaN       NaN       NaN       NaN    153.75      11.0        47.16   \n",
       "\n",
       "   企业交易绝对值最低金额  企业交易绝对值_mean  企业交易绝对值_std  近一月平均账户余额  近一月最大账户余额  近一月账户余额方差  \\\n",
       "0         1.24     29.491176    36.946221  30.410000      75.98  21.636022   \n",
       "1         2.66     12.665000     6.175532  21.733333      29.67  12.345365   \n",
       "2         1.24     35.878194    20.059370  46.608636      86.19  20.001098   \n",
       "3         1.24     70.954486    90.850831  64.904419     117.28  17.986447   \n",
       "4         1.51     15.188000    11.972871  10.155000      16.89   7.911546   \n",
       "\n",
       "   倒数第三个月平均账户余额  倒数第三个月最大账户余额  倒数第三个月账户余额方差  第三个月与第一个月余额均值差  第三个月与第一个月余额最大差  \\\n",
       "0     69.550909        106.75     26.308696      -39.140909          -30.77   \n",
       "1     17.582500         31.10     13.239256        4.150833           -1.43   \n",
       "2     45.889474         87.49     19.815098        0.719163           -1.30   \n",
       "3    135.555567        398.12    125.497190      -70.651148         -280.84   \n",
       "4     15.103571         47.57     13.257565       -4.948571          -30.68   \n",
       "\n",
       "   近一月交易代码个数  近一月渠道代码个数  倒数第三月交易代码个数  倒数第三月渠道代码个数  第三个月与第一个月渠道数差  \\\n",
       "0        6.0        4.0          5.0          4.0            0.0   \n",
       "1        2.0        1.0          1.0          1.0            0.0   \n",
       "2        9.0        6.0          9.0          7.0           -1.0   \n",
       "3        9.0        4.0         10.0          7.0           -3.0   \n",
       "4        3.0        2.0          4.0          3.0           -1.0   \n",
       "\n",
       "   第三个月与第一个月交易代码数差  交易代码_tfidf_8  摘要信息_countvec_3  客户编号_摘要信息_w2v_0  \\\n",
       "0              1.0     -0.074365        -0.002396        -0.327669   \n",
       "1              1.0     -0.086773         0.000041        -0.670497   \n",
       "2              0.0      0.138988        -0.025896         0.914132   \n",
       "3             -1.0     -0.096726        -0.004582        -0.261908   \n",
       "4             -1.0      0.188417        -0.000601        -0.813936   \n",
       "\n",
       "   客户编号_交易代码_w2v_2  客户编号_交易代码_w2v_5  渠道代码_countvec_9  客户编号_渠道代码_w2v_6  \\\n",
       "0        -0.349541        -0.285305        -0.612251         0.061953   \n",
       "1        -0.504787        -0.342732        -0.156541         0.071143   \n",
       "2        -0.204240        -0.197763         1.939829        -0.040857   \n",
       "3        -0.143365        -0.145577         8.922879        -0.014322   \n",
       "4        -0.492736        -0.404158        -0.121278         0.042085   \n",
       "\n",
       "   摘要信息_countvec_2  摘要信息_countvec_0  渠道代码_countvec_8  交易对手客户编号_countvec_7  \\\n",
       "0        -0.020211        23.004029        -0.079323            -0.010378   \n",
       "1         0.002159         6.001223         0.085791            -0.002414   \n",
       "2        -0.380561        12.086482         4.668638            -0.006806   \n",
       "3         0.026084        78.021008       -24.824091            -0.039502   \n",
       "4         0.003389        14.001670         2.227816            -0.005464   \n",
       "\n",
       "   摘要信息_tfidf_1  摘要信息_countvec_1  合约账户余额_mean_低频交易日  合约账户余额_min_低频交易日  \\\n",
       "0     -0.020252         0.920446          18.545000              0.00   \n",
       "1     -0.114354        -0.012561          29.660000             29.66   \n",
       "2      0.628993        22.356446          50.120667             29.40   \n",
       "3      0.002599         1.274078          38.795641              0.00   \n",
       "4     -0.079735        -0.014037          15.187273              0.00   \n",
       "\n",
       "   合约账户余额_max_低频交易日  折人民币交易金额_max_低频交易日  \n",
       "0             39.38               17.40  \n",
       "1             29.66               15.85  \n",
       "2             79.02               70.68  \n",
       "3             67.80               59.41  \n",
       "4             47.57               47.16  "
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "TARGET = TARGET.merge(AA_tr, on = '客户编号', how = 'left')\n",
    "TARGET.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "09edec50-c523-4f73-98a6-a232ba51078e",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T03:19:37.336391Z",
     "iopub.status.busy": "2024-11-11T03:19:37.336078Z",
     "iopub.status.idle": "2024-11-11T03:19:37.427026Z",
     "msg_id": "464a8fd7-4d2f-41da-a598-53fc370f04b9",
     "shell.execute_reply": "2024-11-11T03:19:37.426363Z",
     "shell.execute_reply.started": "2024-11-11T03:19:37.336363Z"
    }
   },
   "outputs": [],
   "source": [
    "# Write the current TARGET frame to a pickle file under ../data.\n",
    "TARGET.to_pickle(path=\"../data/交易流水表_原本数据加文本特征_A榜.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "c5f21872-afae-41b5-a6d1-aafd0188d025",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-11T03:20:07.698917Z",
     "iopub.status.busy": "2024-11-11T03:20:07.698418Z",
     "iopub.status.idle": "2024-11-11T03:20:07.828587Z",
     "msg_id": "36448a80-d99d-45d4-8619-a3ba995f936e",
     "shell.execute_reply": "2024-11-11T03:20:07.827884Z",
     "shell.execute_reply.started": "2024-11-11T03:20:07.698880Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
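   ],
   "source": [
    "# Disabled one-off check: reload the pickle written by the previous cell and compare it\n",
    "# with a reference pickle using DataFrame.equals. The stored output (True) was recorded\n",
    "# while these lines were active.\n",
    "# NOTE(review): the reference path below is absolute and machine-specific; point it at a\n",
    "# local copy before re-enabling the check.\n",
    "# df = pd.read_pickle(\"../data/交易流水表_原本数据加文本特征_A榜.pkl\")\n",
    "# o_df = pd.read_pickle(\"/home/mole/work/heyuyang/1031/交易流水表业务加文本_22点.pkl\")\n",
    "# df.equals(o_df)"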
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c4376af-7675-4764-aed1-d3315604a62f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
